Exploring Data

The data that I am using for this project can be accessed from https://www.kaggle.com/datasets/shashwatwork/municipal-waste-management-cost-prediction?resource=download

# Load the municipal waste dataset (CSV) and inspect its structure
# (4341 obs. of 39 variables per the str() output below).
# NOTE(review): absolute, user-specific path — assumes the file sits on this
# machine's Desktop; consider a relative path for portability.
waste_data <- read.csv("~/Desktop/Machine Learning/public_data_waste_fee.csv")
str(waste_data)
## 'data.frame':    4341 obs. of  39 variables:
##  $ region    : chr  "Emilia_Romagna" "Emilia_Romagna" "Emilia_Romagna" "Emilia_Romagna" ...
##  $ province  : chr  "Ferrara" "Ferrara" "Ferrara" "Ferrara" ...
##  $ name      : chr  "Comacchio" "Lagosanto" "Goro" "Mesola" ...
##  $ tc        : num  502 228 268 199 234 ...
##  $ cres      : num  129.3 49.5 50.6 41.1 58.3 ...
##  $ csor      : num  66.4 44.1 44.6 40.4 26 ...
##  $ istat     : int  38006 38011 38025 38014 110005 38010 38030 58120 27005 8050 ...
##  $ area      : num  283.8 34.4 26.6 84.3 35.7 ...
##  $ pop       : int  22648 4952 3895 7140 12193 3003 7364 67626 11793 2861 ...
##  $ alt       : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ isle      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sea       : int  1 1 1 1 1 0 0 1 1 1 ...
##  $ pden      : num  79.8 143.8 146.3 84.7 341.5 ...
##  $ wden      : num  119671 70031 81117 43320 201565 ...
##  $ urb       : int  2 3 3 3 2 3 3 2 2 2 ...
##  $ fee       : chr  "PAYT" "PAYT" "PAYT" "PAYT" ...
##  $ d_fee     : int  1 1 1 1 0 1 1 0 0 0 ...
##  $ sample    : int  1 1 1 1 0 1 1 0 1 0 ...
##  $ organic   : num  NA 35.041 37.377 45.31 0.428 ...
##  $ paper     : num  4.36 9.89 11.99 9.76 6.6 ...
##  $ glass     : num  3.59 9.52 6.65 7.55 4.33 ...
##  $ wood      : num  2.27 4.00 1.32e-05 1.49e-01 2.30 ...
##  $ metal     : num  0.462 1.861 0.745 0.747 0.103 ...
##  $ plastic   : num  1.13 4.64 5.22 5.2 5.12 ...
##  $ raee      : num  0.346 1.609 NA NA 0.275 ...
##  $ texile    : num  0.112 0.351 0.449 0.4 0.287 ...
##  $ other     : num  3.2 9.02 16.04 9.77 4.05 ...
##  $ msw_so    : num  20396261 1831407 1694922 2881055 3026700 ...
##  $ msw_un    : int  13560520 580460 464400 770860 4169180 349620 556540 7895520 5659520 296480 ...
##  $ msw       : int  33956781 2411867 2159322 3651915 7195880 1682628 3336429 33435410 15175582 1553789 ...
##  $ sor       : num  60.1 75.9 78.5 78.9 42.1 ...
##  $ geo       : num  3 3 3 3 1 3 NA 2 3 3 ...
##  $ roads     : num  285 11 49 165 60 65 NA 329 77 17 ...
##  $ s_wteregio: num  33.11 33.11 33.11 33.11 4.05 ...
##  $ s_landfill: num  15.2 15.2 15.2 15.2 45.4 ...
##  $ gdp       : num  7.27 7.11 7.27 7.09 7.25 ...
##  $ proads    : num  4.35 6.08 4.34 3.71 5.27 ...
##  $ wage      : num  9.44 9.51 8.89 9.43 9.13 ...
##  $ finance   : num  7.49 7.32 7.49 7.3 7.46 ...
# List all 39 column names of the dataset.
names(waste_data)
##  [1] "region"     "province"   "name"       "tc"         "cres"      
##  [6] "csor"       "istat"      "area"       "pop"        "alt"       
## [11] "isle"       "sea"        "pden"       "wden"       "urb"       
## [16] "fee"        "d_fee"      "sample"     "organic"    "paper"     
## [21] "glass"      "wood"       "metal"      "plastic"    "raee"      
## [26] "texile"     "other"      "msw_so"     "msw_un"     "msw"       
## [31] "sor"        "geo"        "roads"      "s_wteregio" "s_landfill"
## [36] "gdp"        "proads"     "wage"       "finance"
# Count missing values across the whole data frame (6372 per the output below).
sum(is.na(waste_data))
## [1] 6372
# Tabulate observations per region. Note in the output: "piemonte" is
# lower-cased (inconsistent with the other region names) and Valle_d'Aosta
# contributes only a single row.
summary(as.factor(waste_data$region))
##               Abruzzo            Basilicata              Calabria 
##                   155                    69                   165 
##              Campania        Emilia_Romagna Friuli_Venezia_Giulia 
##                   405                   308                   183 
##                 Lazio               Liguria             Lombardia 
##                   188                   179                  1230 
##                Marche                Molise              piemonte 
##                   103                    70                   114 
##                Puglia              Sardegna               Sicilia 
##                   126                   121                   212 
##               Toscana   Trentino_Alto_Adige                Umbria 
##                   231                   157                    59 
##         Valle_d'Aosta                Veneto 
##                     1                   265

While this data comes in relatively clean, there are a few important issues that need to be addressed. Specifically, I replace the NA values with 0, and convert certain variable columns to factors where they are not already. Factors are used to represent categorical data in R; each unique value in the column is treated as a level of the factor.

# Replace every NA in the data frame with 0, then re-summarize.
# NOTE(review): blanket zero-imputation also zeroes categorical codes such as
# `geo` (NAs visible in the str() output above), creating an artificial
# level 0 — confirm this is intended rather than, e.g., row removal or
# per-column imputation.
waste_data[is.na(waste_data)] <- 0
summary(waste_data)
##     region            province             name                 tc        
##  Length:4341        Length:4341        Length:4341        Min.   : 25.69  
##  Class :character   Class :character   Class :character   1st Qu.:108.04  
##  Mode  :character   Mode  :character   Mode  :character   Median :136.62  
##                                                           Mean   :154.24  
##                                                           3rd Qu.:179.16  
##                                                           Max.   :977.42  
##       cres             csor            istat             area        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  1272   Min.   :   0.00  
##  1st Qu.: 26.94   1st Qu.: 30.04   1st Qu.: 18135   1st Qu.:  10.81  
##  Median : 41.46   Median : 48.56   Median : 42015   Median :  22.71  
##  Mean   : 53.53   Mean   : 51.87   Mean   : 47470   Mean   :  40.94  
##  3rd Qu.: 65.98   3rd Qu.: 66.03   3rd Qu.: 70049   3rd Qu.:  47.45  
##  Max.   :670.32   Max.   :582.16   Max.   :111107   Max.   :1287.39  
##       pop               alt              isle               sea        
##  Min.   :     34   Min.   :   0.0   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:   1579   1st Qu.:  79.0   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :   3535   Median : 239.0   Median :0.000000   Median :0.0000  
##  Mean   :  10204   Mean   : 309.6   Mean   :0.005068   Mean   :0.1682  
##  3rd Qu.:   8199   3rd Qu.: 459.0   3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :2617175   Max.   :1816.0   Max.   :1.000000   Max.   :1.0000  
##       pden              wden              urb            fee           
##  Min.   :    0.0   Min.   :      0   Min.   :0.000   Length:4341       
##  1st Qu.:   62.0   1st Qu.:  23441   1st Qu.:2.000   Class :character  
##  Median :  150.8   Median :  68255   Median :3.000   Mode  :character  
##  Mean   :  404.5   Mean   : 191792   Mean   :2.487                     
##  3rd Qu.:  399.0   3rd Qu.: 194281   3rd Qu.:3.000                     
##  Max.   :12122.8   Max.   :4978556   Max.   :3.000                     
##      d_fee            sample          organic           paper       
##  Min.   :0.0000   Min.   :0.0000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 4.675   1st Qu.: 8.614  
##  Median :0.0000   Median :1.0000   Median :22.630   Median :10.856  
##  Mean   :0.1283   Mean   :0.5469   Mean   :19.648   Mean   :10.898  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:30.907   3rd Qu.:13.051  
##  Max.   :1.0000   Max.   :1.0000   Max.   :61.639   Max.   :45.288  
##      glass             wood            metal            plastic      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.: 7.085   1st Qu.: 0.000   1st Qu.: 0.7456   1st Qu.: 4.069  
##  Median : 9.071   Median : 2.692   Median : 1.4697   Median : 5.762  
##  Mean   : 9.335   Mean   : 3.076   Mean   : 1.6644   Mean   : 6.057  
##  3rd Qu.:11.257   3rd Qu.: 5.080   3rd Qu.: 2.2983   3rd Qu.: 7.506  
##  Max.   :39.836   Max.   :25.117   Max.   :20.6715   Max.   :31.605  
##       raee             texile              other            msw_so         
##  Min.   : 0.0000   Min.   : 0.000000   Min.   : 0.000   Min.   :        0  
##  1st Qu.: 0.6493   1st Qu.: 0.000053   1st Qu.: 3.606   1st Qu.:   373965  
##  Median : 1.1196   Median : 0.457738   Median : 6.887   Median :  1040737  
##  Mean   : 1.1439   Mean   : 0.580364   Mean   : 7.693   Mean   :  3248581  
##  3rd Qu.: 1.5332   3rd Qu.: 0.854879   3rd Qu.:11.002   3rd Qu.:  2725645  
##  Max.   :17.9536   Max.   :10.584472   Max.   :37.156   Max.   :765130099  
##      msw_un               msw                 sor             geo      
##  Min.   :     6185   Min.   :1.997e+04   Min.   : 0.25   Min.   :0.00  
##  1st Qu.:   175180   1st Qu.:6.117e+05   1st Qu.:57.83   1st Qu.:1.00  
##  Median :   409060   Median :1.524e+06   Median :70.84   Median :3.00  
##  Mean   :  2042522   Mean   :5.311e+06   Mean   :66.24   Mean   :2.14  
##  3rd Qu.:  1056920   3rd Qu.:3.954e+06   3rd Qu.:79.09   3rd Qu.:3.00  
##  Max.   :926757220   Max.   :1.692e+09   Max.   :97.48   Max.   :3.00  
##      roads            s_wteregio       s_landfill          gdp        
##  Min.   :    0.00   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:   19.00   1st Qu.: 5.634   1st Qu.: 4.551   1st Qu.: 6.725  
##  Median :   45.00   Median :18.540   Median :11.286   Median : 7.060  
##  Mean   :   91.53   Mean   :20.432   Mean   :18.688   Mean   : 6.588  
##  3rd Qu.:   96.00   3rd Qu.:38.501   3rd Qu.:31.493   3rd Qu.: 7.511  
##  Max.   :14970.00   Max.   :65.122   Max.   :92.532   Max.   :10.539  
##      proads            wage           finance      
##  Min.   :-3.367   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.076   1st Qu.: 9.143   1st Qu.: 6.926  
##  Median : 4.191   Median : 9.499   Median : 7.272  
##  Mean   : 3.820   Mean   : 8.827   Mean   : 6.786  
##  3rd Qu.: 4.997   3rd Qu.: 9.654   3rd Qu.: 7.736  
##  Max.   : 8.980   Max.   :10.485   Max.   :10.855
# Frequency of observations by province.
summary(as.factor(waste_data$province))
##               Bergamo               Brescia                Milano 
##                   218                   172                   131 
##               Bolzano               Salerno                  Como 
##                   115                   112                   106 
##               Cremona                 Udine                Varese 
##                   102                   102                    99 
##               Vicenza                 Pavia                 Lecco 
##                    97                    94                    80 
##                Napoli              Avellino               Caserta 
##                    80                    79                    78 
##                  Roma                Chieti               Cosenza 
##                    70                    69                    69 
##                Biella               Mantova               Sondrio 
##                    64                    63                    62 
##                Padova                Savona             Benevento 
##                    60                    59                    56 
##               Bologna Monza e della Brianza               Potenza 
##                    54                    54                    54 
##               Imperia             Pordenone            Campobasso 
##                    52                    50                    49 
##               Catania                  Lodi                Genova 
##                    49                    49                    48 
##                 Lecce                Novara              Piacenza 
##                    47                    46                    46 
##               Palermo       Pesaro e Urbino             Frosinone 
##                    45                    45                    44 
##               Venezia                Verona               Messina 
##                    44                    43                    42 
##                 Parma    Reggio nell'Emilia                Trento 
##                    42                    42                    42 
##               Firenze              L'Aquila                Modena 
##                    40                    37                    36 
##                  Pisa          Sud Sardegna               Perugia 
##                    36                    36                    35 
##               Viterbo               Sassari               Pescara 
##                    35                    34                    31 
##    Reggio di Calabria         Forli'-Cesena                 Nuoro 
##                    31                    30                    30 
##             Catanzaro                 Lucca                 Fermo 
##                    29                    29                    28 
##                Arezzo              Grosseto               Gorizia 
##                    27                    26                    25 
##         Vibo Valentia                  Bari                 Terni 
##                    25                    24                    24 
##                Foggia             Agrigento               Isernia 
##                    23                    21                    21 
##                Latina               Belluno               Ferrara 
##                    21                    20                    20 
##             La Spezia               Pistoia                Rimini 
##                    20                    20                    20 
##               Trapani               Ravenna                 Rieti 
##                    19                    18                    18 
##                 Siena               Taranto                Teramo 
##                    18                    18                    18 
##               Livorno              Siracusa                Ancona 
##                    17                    17                    16 
##              Cagliari                Matera         Ascoli Piceno 
##                    16                    15                    14 
##         Massa-Carrara               Crotone                  Enna 
##                    12                    11                     8 
##                Ragusa Barletta-Andria-Trani              Brindisi 
##                     8                     7                     7 
##                 Prato               Trieste              Oristano 
##                     6                     6                     5 
##         Caltanissetta                  Asti                 Aosta 
##                     3                     2                     1 
##               (Other) 
##                     3
# Frequency by municipality name; almost every name occurs once
# ((Other): 4235 in the output below), so `name` is effectively an identifier.
summary(as.factor(waste_data$name))
##                                          Castro                  Peglio 
##                       6                       2                       2 
##             Abano Terme         Abbadia Lariana              Abbateggio 
##                       1                       1                       1 
##           Abbiategrasso      Abetone Cutigliano                   Acate 
##                       1                       1                       1 
##                  Acerra          Aci Bonaccorsi            Aci Castello 
##                       1                       1                       1 
##                Acireale             Acquafredda              Acqualagna 
##                       1                       1                       1 
##   Acquanegra sul Chiese             Acquasparta     Acquaviva d'Isernia 
##                       1                       1                       1 
##   Acquaviva delle Fonti              Acquedolci                    Acri 
##                       1                       1                       1 
##                   Acuto                 Adelfia                  Adrano 
##                       1                       1                       1 
##      Adrara San Martino        Adrara San Rocco                    Adro 
##                       1                       1                       1 
##                    Affi                  Affile                Agazzano 
##                       1                       1                       1 
##                 Agerola                 Agliana                    Agna 
##                       1                       1                       1 
##               Agnadello                Agnosine          Agrate Brianza 
##                       1                       1                       1 
##        Agrate Conturbia               Agrigento                Agropoli 
##                       1                       1                       1 
##               Agugliano                Aicurzio                  Aielli 
##                       1                       1                       1 
##          Aiello Calabro       Aiello del Friuli       Aiello del Sabato 
##                       1                       1                       1 
##                  Ailano                 Ailoche                  Airola 
##                       1                       1                       1 
##                  Airole                  Airuno                     Ala 
##                       1                       1                       1 
##           Alà dei Sardi                  Alanno          Alano di Piave 
##                       1                       1                       1 
##                 Alassio               Albairate          Albano Laziale 
##                       1                       1                       1 
##  Albano Sant'Alessandro      Albaredo Arnaboldi        Albaredo d'Adige 
##                       1                       1                       1 
##  Albaredo per San Marco                Albareto                 Albenga 
##                       1                       1                       1 
##             Alberobello               Albettone                 Albiate 
##                       1                       1                       1 
##             Albignasego                 Albinea                  Albino 
##                       1                       1                       1 
##                 Albiolo      Albisola Superiore        Albissola Marina 
##                       1                       1                       1 
##               Albizzate              Albosaggia               Albuzzano 
##                       1                       1                       1 
##                  Alcamo           Aldino/Aldein Alessandria della Rocca 
##                       1                       1                       1 
##                Alessano                  Alezio                  Alfano 
##                       1                       1                       1 
##                Alfedena               Alfonsine                 Alghero 
##                       1                       1                       1 
##                   Algua                Allerona                 Alliste 
##                       1                       1                       1 
##               Allumiere                    Almè  Almenno San Bartolomeo 
##                       1                       1                       1 
##   Almenno San Salvatore                  Alonte                  Alseno 
##                       1                       1                       1 
##         Alta Val Tidone      Alta Valle Intelvi                  Altare 
##                       1                       1                       1 
##        Altavilla Irpina     Altavilla Silentina     Altavilla Vicentina 
##                       1                       1                       1 
##                 (Other) 
##                    4235
# Fee scheme is binary: PAYT (557 obs.) vs STANDARD (3784 obs.).
summary(as.factor(waste_data$fee))
##     PAYT STANDARD 
##      557     3784

Visualizations

# Load necessary libraries
library(ggplot2)
library(reshape2)

# Reshape the three cost columns ('tc', 'cres', 'csor') from wide to long so
# all three densities can be drawn on one panel.
waste_long <- melt(waste_data, measure.vars = c("tc", "cres", "csor"),
                   variable.name = "Cost_Type", value.name = "Cost")

# Create smooth density plot comparing the three cost distributions.
# `linewidth` replaces the `size` aesthetic for lines, which was deprecated in
# ggplot2 3.4.0 (see the warning the original code emitted).
ggplot(waste_long, aes(x = Cost, fill = Cost_Type, color = Cost_Type)) +
  geom_density(alpha = 0.5, linewidth = 1) +
  labs(title = "Comparison of Total, Sorted, and Residual Costs",
       x = "Cost", y = "Density", fill = "Cost Type", color = "Cost Type") +
  theme_minimal() +
  scale_fill_manual(values = c("tc" = "blue", "cres" = "green", "csor" = "red")) +
  scale_color_manual(values = c("tc" = "blue", "cres" = "green", "csor" = "red"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Interpretations: cres = residual cost per capita; csor = sorted cost per capita.

Total Costs dominate in scale compared to the other two types, evidenced by its higher density for larger cost values. Residual Costs are generally lower, with a sharp peak close to zero and minimal spread beyond that. Sorted Costs have the least density overall, implying a minor contribution to the total cost framework. The plot highlights significant differences in the scale and distribution of the three cost types. Most costs are concentrated in the lower ranges for all three categories, but Total Costs have the most variability and presence at higher values. This may suggest that while the Total Costs include broad contributions, Residual and Sorted Costs are more specific and less impactful overall.

# Simple linear model of total cost on (raw) population.
# NOTE(review): the scatterplot below plots log(pop + 1) on the x-axis, so
# this model is not the line drawn — geom_smooth() refits lm() on the
# plotted (log-transformed) x values.
model <- lm(tc ~ pop, data = waste_data)

# Plot with ggplot2, including the regression line

library(ggplot2)
ggplot(waste_data, aes(x = log(pop + 1), y = tc)) +
  geom_point() +
  geom_smooth(method = "lm", col = "blue") +
  labs(title = "Relationship between TC and Population",
       # Axis label now matches the plotted transform (was "Population").
       x = "log(Population + 1)",
       y = "TC (Cost)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Interpretations: This scatterplot visualizes the relationship between TC (Cost) on the y-axis and Population on the x-axis, where little to no relationship is expressed. The trend line is nearly horizontal, showing that changes in population do not systematically affect the cost (TC). This suggests little or no correlation between the two variables.

# Scatterplot of total cost per capita against the geo code, using the
# formula interface; axis labels and title are set explicitly so the plot is
# identical to the $-extraction form.
plot(tc ~ geo, data = waste_data, xlab = "geo",
     ylab = "Total Cost per capita eur", main = "Cost per capita eur v.s. Geo")

# Boxplot of total cost per capita within each geo code, same formula style.
boxplot(tc ~ geo, data = waste_data, xlab = "geo",
        ylab = "Total Cost per capita eur", main = "Cost per capita eur v.s. Geo")

Interpretations: There appears to be variability in the cost per capita across the different geographical categories (represented by geo values 0, 1, 1.5, 2, and 3). Some geo categories might consistently have lower median costs, while others have higher medians with larger spreads. The presence of many outliers across geo categories suggests a wide range of costs that may not fit typical trends, requiring further analysis.

# Base-R scatter of total cost against the binary sea indicator (0/1).
plot(waste_data$sea, waste_data$tc)

# ggplot boxplot of total cost split by coastal status; factor(sea) makes
# ggplot treat the 0/1 indicator as categorical rather than continuous.
ggplot(waste_data, aes(x = factor(sea),  y= tc)) + 
  geom_boxplot()

Interpretations: sea is a categorical variable with two levels (0 and 1). It represents whether a region is related to “sea” or not (e.g., coastal vs. non-coastal). tc represents the total cost per capita in euros. Total costs are generally higher in regions associated with sea = 1. This could imply that being near the sea is associated with higher costs, potentially due to factors like additional expenses related to coastal infrastructure and higher operational costs in sea-related regions (e.g., waste management, transport logistics).

# Histogram of total cost per capita; the long right tail here motivates the
# log transformation applied further down.
hist(waste_data$tc, col='lightblue')

Interpretation: I recognize that there is a right skew; I can correct this with a log transformation.

# Log-transform total cost to reduce the right skew. log1p(x) computes
# log(1 + x) directly and is numerically more accurate than log(x + 1)
# for values of x near zero, while giving identical results here.
waste_data$log_tc <- log1p(waste_data$tc)

# Histogram of the transformed variable.
hist(waste_data$log_tc, col = 'lightblue')

Linear Regression

Data Partitioning

set.seed(7)
# Total number of observations in the full dataset.
total_obs <- nrow(waste_data)
# Data partition / sample splitting: 80% train, 20% test.
# floor() makes the truncation of the fractional size 0.8 * total_obs
# explicit (sample() silently truncates a non-integer size, so with the same
# seed the resulting split is unchanged). seq_len()/nrow() replace the
# fragile 1:n and dim()[1] idioms.
train_data_indices <- sample(seq_len(total_obs), floor(0.8 * total_obs))
train_data <- waste_data[train_data_indices, ]
test_data <- waste_data[-train_data_indices, ]
train_obs <- nrow(train_data)

Instead of building linear regression models on the log-scale total cost, I will build a linear regression model on the original scale of total cost, i.e., without the log transformation used to correct the right-skewness of cost.

# Confirm the training-set columns; the derived log_tc is the last column
# (position 40 in the output below) and must be excluded from the model.
colnames(train_data)
##  [1] "region"     "province"   "name"       "tc"         "cres"      
##  [6] "csor"       "istat"      "area"       "pop"        "alt"       
## [11] "isle"       "sea"        "pden"       "wden"       "urb"       
## [16] "fee"        "d_fee"      "sample"     "organic"    "paper"     
## [21] "glass"      "wood"       "metal"      "plastic"    "raee"      
## [26] "texile"     "other"      "msw_so"     "msw_un"     "msw"       
## [31] "sor"        "geo"        "roads"      "s_wteregio" "s_landfill"
## [36] "gdp"        "proads"     "wage"       "finance"    "log_tc"
# Fit the full linear model: tc regressed on every candidate predictor.
# Columns 4:(ncol-1) drop the identifier columns (region, province, name)
# and the derived log_tc response.
# NOTE(review): the commented alternative below is NOT equivalent — dropping
# only columns 1:3 would keep log_tc (column 40) as a predictor.
# The printed summary reports 2 coefficients as NA due to singularities
# (d_fee and finance), i.e. they are linearly dependent on other predictors.
#lm_full <- lm(tc ~ ., data = train_data[, -c(1:3)])

lm_full <- lm(tc~., data=train_data[, 4:(ncol(train_data)- 1)])
print(summary(lm_full))
## 
## Call:
## lm(formula = tc ~ ., data = train_data[, 4:(ncol(train_data) - 
##     1)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -811.27  -24.39   -7.84   15.89  652.33 
## 
## Coefficients: (2 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.569e+01  8.613e+00   7.627 3.09e-14 ***
## cres         9.447e-01  2.167e-02  43.590  < 2e-16 ***
## csor         8.704e-01  2.588e-02  33.630  < 2e-16 ***
## istat       -9.154e-05  3.628e-05  -2.523 0.011678 *  
## area         1.202e-01  1.860e-02   6.461 1.19e-10 ***
## pop         -1.999e-04  1.363e-04  -1.466 0.142650    
## alt          3.556e-04  3.781e-03   0.094 0.925087    
## isle         6.727e+01  1.090e+01   6.171 7.56e-10 ***
## sea          1.964e+01  2.581e+00   7.609 3.54e-14 ***
## pden        -3.254e-02  5.668e-03  -5.741 1.02e-08 ***
## wden         8.092e-05  1.226e-05   6.599 4.78e-11 ***
## urb         -8.449e-01  1.914e+00  -0.442 0.658840    
## feeSTANDARD -3.519e-01  2.634e+00  -0.134 0.893736    
## d_fee               NA         NA      NA       NA    
## sample      -2.106e+00  2.911e+00  -0.724 0.469364    
## organic     -3.420e-01  8.756e-02  -3.906 9.57e-05 ***
## paper       -5.825e-01  2.572e-01  -2.265 0.023567 *  
## glass       -9.723e-01  2.531e-01  -3.842 0.000124 ***
## wood        -7.476e-01  3.902e-01  -1.916 0.055445 .  
## metal        7.028e-03  6.879e-01   0.010 0.991849    
## plastic     -2.156e-01  3.072e-01  -0.702 0.482912    
## raee         2.461e+00  1.054e+00   2.334 0.019628 *  
## texile      -1.572e+00  1.178e+00  -1.335 0.181960    
## other        1.822e-02  2.151e-01   0.085 0.932489    
## msw_so       9.516e-07  4.592e-06   0.207 0.835865    
## msw_un       6.319e-07  4.716e-06   0.134 0.893432    
## msw         -5.190e-07  4.676e-06  -0.111 0.911637    
## sor          2.535e-01  1.075e-01   2.357 0.018487 *  
## geo          1.773e+00  1.929e+00   0.919 0.358056    
## roads       -3.995e-03  2.910e-03  -1.373 0.169839    
## s_wteregio  -9.210e-02  6.967e-02  -1.322 0.186277    
## s_landfill   2.868e-02  5.534e-02   0.518 0.604298    
## gdp          4.743e+00  7.623e-01   6.223 5.48e-10 ***
## proads      -3.403e+00  8.175e-01  -4.163 3.22e-05 ***
## wage        -2.640e+00  8.190e-01  -3.223 0.001278 ** 
## finance             NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.1 on 3438 degrees of freedom
## Multiple R-squared:  0.6323, Adjusted R-squared:  0.6287 
## F-statistic: 179.1 on 33 and 3438 DF,  p-value: < 2.2e-16

Backward Selection with BIC

# Backward stepwise elimination from the full model; k = log(n) swaps the
# default AIC penalty (k = 2) for the BIC penalty, matching the section title.
lm_bwd <- step(lm_full, direction='backward', k=log(train_obs)) 
## Start:  AIC=26844.68
## tc ~ cres + csor + istat + area + pop + alt + isle + sea + pden + 
##     wden + urb + fee + d_fee + sample + organic + paper + glass + 
##     wood + metal + plastic + raee + texile + other + msw_so + 
##     msw_un + msw + sor + geo + roads + s_wteregio + s_landfill + 
##     gdp + proads + wage + finance
## 
## 
## Step:  AIC=26844.68
## tc ~ cres + csor + istat + area + pop + alt + isle + sea + pden + 
##     wden + urb + fee + d_fee + sample + organic + paper + glass + 
##     wood + metal + plastic + raee + texile + other + msw_so + 
##     msw_un + msw + sor + geo + roads + s_wteregio + s_landfill + 
##     gdp + proads + wage
## 
## 
## Step:  AIC=26844.68
## tc ~ cres + csor + istat + area + pop + alt + isle + sea + pden + 
##     wden + urb + fee + sample + organic + paper + glass + wood + 
##     metal + plastic + raee + texile + other + msw_so + msw_un + 
##     msw + sor + geo + roads + s_wteregio + s_landfill + gdp + 
##     proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - metal       1         0  7307517 26836
## - other       1        15  7307532 26836
## - alt         1        19  7307535 26836
## - msw         1        26  7307543 26836
## - fee         1        38  7307554 26836
## - msw_un      1        38  7307555 26836
## - msw_so      1        91  7307608 26837
## - urb         1       414  7307931 26837
## - s_landfill  1       571  7308087 26837
## - plastic     1      1047  7308563 26837
## - sample      1      1113  7308629 26837
## - geo         1      1796  7309312 26837
## - s_wteregio  1      3714  7311231 26838
## - texile      1      3788  7311305 26838
## - roads       1      4007  7311523 26838
## - pop         1      4570  7312087 26839
## - wood        1      7803  7315320 26840
## - paper       1     10906  7318422 26842
## - raee        1     11584  7319100 26842
## - sor         1     11807  7319323 26842
## - istat       1     13531  7321047 26843
## <none>                     7307517 26845
## - wage        1     22086  7329602 26847
## - glass       1     31374  7338890 26851
## - organic     1     32426  7339942 26852
## - proads      1     36829  7344345 26854
## - pden        1     70050  7377566 26870
## - isle        1     80948  7388465 26875
## - gdp         1     82299  7389816 26875
## - area        1     88730  7396246 26878
## - wden        1     92558  7400075 26880
## - sea         1    123069  7430586 26894
## - csor        1   2403925  9711441 27824
## - cres        1   4038641 11346158 28364
## 
## Step:  AIC=26836.53
## tc ~ cres + csor + istat + area + pop + alt + isle + sea + pden + 
##     wden + urb + fee + sample + organic + paper + glass + wood + 
##     plastic + raee + texile + other + msw_so + msw_un + msw + 
##     sor + geo + roads + s_wteregio + s_landfill + gdp + proads + 
##     wage
## 
##              Df Sum of Sq      RSS   AIC
## - other       1        15  7307532 26828
## - alt         1        20  7307537 26828
## - msw         1        26  7307543 26828
## - fee         1        38  7307555 26828
## - msw_un      1        38  7307555 26828
## - msw_so      1        91  7307608 26828
## - urb         1       415  7307932 26829
## - s_landfill  1       572  7308089 26829
## - plastic     1      1048  7308565 26829
## - sample      1      1117  7308634 26829
## - geo         1      1796  7309313 26829
## - s_wteregio  1      3744  7311261 26830
## - texile      1      3791  7311308 26830
## - roads       1      4008  7311525 26830
## - pop         1      4570  7312087 26830
## - wood        1      7928  7315444 26832
## - paper       1     10938  7318454 26834
## - raee        1     11783  7319300 26834
## - sor         1     12043  7319560 26834
## - istat       1     13683  7321200 26835
## <none>                     7307517 26836
## - wage        1     22167  7329684 26839
## - glass       1     31382  7338899 26843
## - organic     1     32666  7340183 26844
## - proads      1     36832  7344349 26846
## - pden        1     70224  7377741 26862
## - isle        1     81402  7388918 26867
## - gdp         1     82469  7389986 26867
## - area        1     88819  7396336 26870
## - wden        1     92797  7400314 26872
## - sea         1    123190  7430706 26886
## - csor        1   2405935  9713452 27816
## - cres        1   4038674 11346191 28356
## 
## Step:  AIC=26828.38
## tc ~ cres + csor + istat + area + pop + alt + isle + sea + pden + 
##     wden + urb + fee + sample + organic + paper + glass + wood + 
##     plastic + raee + texile + msw_so + msw_un + msw + sor + geo + 
##     roads + s_wteregio + s_landfill + gdp + proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - alt         1        30  7307562 26820
## - msw         1        32  7307564 26820
## - fee         1        32  7307564 26820
## - msw_un      1        45  7307577 26820
## - msw_so      1       101  7307634 26820
## - urb         1       413  7307945 26820
## - s_landfill  1       560  7308092 26820
## - sample      1      1103  7308635 26821
## - plastic     1      1190  7308722 26821
## - geo         1      1845  7309377 26821
## - s_wteregio  1      3734  7311266 26822
## - texile      1      3856  7311388 26822
## - roads       1      4019  7311551 26822
## - pop         1      4593  7312125 26822
## - wood        1      7928  7315460 26824
## - paper       1     11693  7319225 26826
## - raee        1     11822  7319354 26826
## - istat       1     13688  7321220 26827
## - sor         1     16639  7324171 26828
## <none>                     7307532 26828
## - wage        1     22333  7329865 26831
## - glass       1     33615  7341147 26836
## - organic     1     36786  7344318 26838
## - proads      1     36919  7344451 26838
## - pden        1     70557  7378089 26854
## - isle        1     81601  7389134 26859
## - gdp         1     82675  7390207 26859
## - area        1     90858  7398390 26863
## - wden        1     93446  7400978 26864
## - sea         1    123458  7430990 26878
## - csor        1   2414555  9722088 27812
## - cres        1   4043259 11350791 28349
## 
## Step:  AIC=26820.24
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     urb + fee + sample + organic + paper + glass + wood + plastic + 
##     raee + texile + msw_so + msw_un + msw + sor + geo + roads + 
##     s_wteregio + s_landfill + gdp + proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - msw         1        33  7307595 26812
## - fee         1        38  7307600 26812
## - msw_un      1        47  7307609 26812
## - msw_so      1       104  7307666 26812
## - urb         1       396  7307958 26812
## - s_landfill  1       578  7308140 26812
## - sample      1      1136  7308698 26813
## - plastic     1      1238  7308800 26813
## - geo         1      1835  7309397 26813
## - s_wteregio  1      3707  7311269 26814
## - texile      1      3828  7311390 26814
## - roads       1      4070  7311632 26814
## - pop         1      4601  7312162 26814
## - wood        1      7946  7315508 26816
## - paper       1     11700  7319261 26818
## - raee        1     11887  7319449 26818
## - istat       1     13680  7321241 26819
## - sor         1     16691  7324253 26820
## <none>                     7307562 26820
## - wage        1     22718  7330280 26823
## - glass       1     34625  7342187 26828
## - organic     1     36785  7344347 26830
## - proads      1     39377  7346939 26831
## - pden        1     70557  7378118 26846
## - isle        1     81676  7389238 26851
## - gdp         1     88746  7396308 26854
## - area        1     92525  7400087 26856
## - wden        1     93452  7401013 26856
## - sea         1    132690  7440252 26875
## - csor        1   2414628  9722190 27803
## - cres        1   4080862 11388424 28353
## 
## Step:  AIC=26812.11
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     urb + fee + sample + organic + paper + glass + wood + plastic + 
##     raee + texile + msw_so + msw_un + sor + geo + roads + s_wteregio + 
##     s_landfill + gdp + proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - fee         1        36  7307631 26804
## - urb         1       385  7307980 26804
## - msw_un      1       417  7308012 26804
## - s_landfill  1       567  7308162 26804
## - sample      1      1153  7308748 26804
## - plastic     1      1249  7308844 26804
## - geo         1      1829  7309424 26805
## - s_wteregio  1      3679  7311274 26806
## - texile      1      3818  7311413 26806
## - roads       1      4096  7311691 26806
## - pop         1      4778  7312373 26806
## - msw_so      1      6838  7314433 26807
## - wood        1      7930  7315525 26808
## - paper       1     11686  7319281 26810
## - raee        1     11879  7319474 26810
## - istat       1     13716  7321311 26810
## - sor         1     16660  7324255 26812
## <none>                     7307595 26812
## - wage        1     22717  7330312 26815
## - glass       1     34637  7342232 26820
## - organic     1     36831  7344426 26821
## - proads      1     39488  7347083 26823
## - pden        1     71957  7379551 26838
## - isle        1     81758  7389353 26843
## - gdp         1     88850  7396445 26846
## - area        1     92521  7400116 26848
## - wden        1     95580  7403175 26849
## - sea         1    132941  7440536 26866
## - csor        1   2414819  9722414 27795
## - cres        1   4080937 11388532 28344
## 
## Step:  AIC=26803.97
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     urb + sample + organic + paper + glass + wood + plastic + 
##     raee + texile + msw_so + msw_un + sor + geo + roads + s_wteregio + 
##     s_landfill + gdp + proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - urb         1       372  7308003 26796
## - msw_un      1       410  7308041 26796
## - s_landfill  1       568  7308199 26796
## - sample      1      1117  7308748 26796
## - plastic     1      1258  7308889 26796
## - geo         1      1861  7309492 26797
## - s_wteregio  1      3781  7311412 26798
## - texile      1      3785  7311416 26798
## - roads       1      4092  7311723 26798
## - pop         1      4759  7312390 26798
## - msw_so      1      6820  7314452 26799
## - wood        1      8102  7315733 26800
## - paper       1     11651  7319282 26801
## - raee        1     11843  7319474 26801
## - istat       1     13899  7321530 26802
## - sor         1     17123  7324754 26804
## <none>                     7307631 26804
## - wage        1     22759  7330390 26807
## - glass       1     34859  7342490 26812
## - organic     1     36922  7344553 26813
## - proads      1     39492  7347123 26814
## - pden        1     71939  7379570 26830
## - isle        1     81848  7389479 26834
## - gdp         1     88999  7396630 26838
## - area        1     92888  7400519 26840
## - wden        1     95556  7403187 26841
## - sea         1    133036  7440667 26858
## - csor        1   2417522  9725153 27788
## - cres        1   4086816 11394447 28338
## 
## Step:  AIC=26795.99
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     sample + organic + paper + glass + wood + plastic + raee + 
##     texile + msw_so + msw_un + sor + geo + roads + s_wteregio + 
##     s_landfill + gdp + proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - msw_un      1       347  7308350 26788
## - s_landfill  1       547  7308550 26788
## - sample      1      1193  7309196 26788
## - plastic     1      1413  7309415 26788
## - geo         1      1787  7309790 26789
## - s_wteregio  1      3666  7311669 26790
## - texile      1      3850  7311853 26790
## - roads       1      3917  7311920 26790
## - pop         1      4596  7312599 26790
## - msw_so      1      6721  7314724 26791
## - wood        1      7986  7315989 26792
## - raee        1     11675  7319678 26793
## - paper       1     11728  7319731 26793
## - istat       1     14077  7322080 26794
## <none>                     7308003 26796
## - sor         1     18713  7326716 26797
## - wage        1     22723  7330725 26799
## - glass       1     36070  7344072 26805
## - organic     1     41013  7349016 26807
## - proads      1     41210  7349212 26807
## - pden        1     71593  7379596 26822
## - isle        1     81920  7389922 26826
## - gdp         1     89624  7397627 26830
## - area        1     93770  7401773 26832
## - wden        1     96165  7404167 26833
## - sea         1    135032  7443035 26851
## - csor        1   2417452  9725455 27780
## - cres        1   4087611 11395614 28330
## 
## Step:  AIC=26788.01
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     sample + organic + paper + glass + wood + plastic + raee + 
##     texile + msw_so + sor + geo + roads + s_wteregio + s_landfill + 
##     gdp + proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - s_landfill  1       508  7308858 26780
## - sample      1      1121  7309470 26780
## - plastic     1      1366  7309715 26780
## - geo         1      1778  7310127 26781
## - s_wteregio  1      3576  7311926 26782
## - roads       1      3729  7312078 26782
## - texile      1      3826  7312175 26782
## - msw_so      1      7437  7315787 26783
## - wood        1      7817  7316167 26784
## - raee        1     11567  7319917 26785
## - paper       1     11694  7320043 26785
## - pop         1     13608  7321957 26786
## - istat       1     14003  7322352 26786
## <none>                     7308350 26788
## - sor         1     18366  7326716 26789
## - wage        1     22672  7331022 26791
## - glass       1     35889  7344239 26797
## - organic     1     41121  7349470 26799
## - proads      1     41174  7349523 26799
## - pden        1     74436  7382785 26815
## - isle        1     82034  7390383 26819
## - gdp         1     89496  7397845 26822
## - area        1     93605  7401954 26824
## - wden        1     97597  7405947 26826
## - sea         1    136023  7444372 26844
## - csor        1   2421691  9730041 27774
## - cres        1   4123690 11432039 28333
## 
## Step:  AIC=26780.1
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     sample + organic + paper + glass + wood + plastic + raee + 
##     texile + msw_so + sor + geo + roads + s_wteregio + gdp + 
##     proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - sample      1      1261  7310118 26772
## - geo         1      1344  7310202 26773
## - plastic     1      1548  7310406 26773
## - roads       1      3561  7312419 26774
## - texile      1      4070  7312928 26774
## - s_wteregio  1      4079  7312937 26774
## - msw_so      1      7397  7316255 26776
## - wood        1      7560  7316418 26776
## - paper       1     11200  7320058 26777
## - raee        1     11463  7320321 26777
## - istat       1     13535  7322393 26778
## - pop         1     13636  7322494 26778
## <none>                     7308858 26780
## - sor         1     18019  7326877 26780
## - wage        1     23661  7332519 26783
## - glass       1     36608  7345466 26789
## - organic     1     40726  7349584 26791
## - proads      1     42037  7350895 26792
## - pden        1     75369  7384227 26808
## - isle        1     81918  7390776 26811
## - gdp         1     90224  7399081 26814
## - area        1     94699  7403557 26817
## - wden        1     98023  7406881 26818
## - sea         1    137812  7446670 26837
## - csor        1   2431722  9740579 27769
## - cres        1   4128949 11437807 28327
## 
## Step:  AIC=26772.54
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + paper + glass + wood + plastic + raee + texile + 
##     msw_so + sor + geo + roads + s_wteregio + gdp + proads + 
##     wage
## 
##              Df Sum of Sq      RSS   AIC
## - geo         1       492  7310610 26765
## - plastic     1      1090  7311209 26765
## - roads       1      3507  7313625 26766
## - texile      1      3875  7313993 26766
## - s_wteregio  1      6103  7316221 26767
## - msw_so      1      7173  7317291 26768
## - wood        1      7902  7318020 26768
## - raee        1     11401  7321520 26770
## - paper       1     11455  7321573 26770
## - istat       1     12882  7323001 26770
## - pop         1     13210  7323329 26771
## <none>                     7310118 26772
## - sor         1     17190  7327308 26772
## - wage        1     22436  7332554 26775
## - glass       1     35350  7345468 26781
## - organic     1     40250  7350368 26784
## - proads      1     42461  7352580 26784
## - pden        1     74724  7384843 26800
## - isle        1     81011  7391129 26803
## - gdp         1     91593  7401711 26808
## - area        1     93448  7403566 26808
## - wden        1     97435  7407553 26810
## - sea         1    142450  7452568 26831
## - csor        1   2452332  9762450 27769
## - cres        1   4152512 11462631 28326
## 
## Step:  AIC=26764.62
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + paper + glass + wood + plastic + raee + texile + 
##     msw_so + sor + roads + s_wteregio + gdp + proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - plastic     1      1338  7311948 26757
## - roads       1      3628  7314238 26758
## - texile      1      4192  7314802 26758
## - s_wteregio  1      5655  7316265 26759
## - msw_so      1      7381  7317992 26760
## - wood        1      7480  7318090 26760
## - paper       1     11139  7321749 26762
## - raee        1     11333  7321944 26762
## - pop         1     13393  7324003 26763
## <none>                     7310610 26765
## - sor         1     17295  7327905 26765
## - istat       1     21914  7332524 26767
## - wage        1     25679  7336289 26769
## - glass       1     35049  7345659 26773
## - organic     1     40277  7350887 26776
## - proads      1     42174  7352784 26776
## - pden        1     76012  7386622 26792
## - isle        1     80568  7391178 26794
## - gdp         1     91357  7401967 26800
## - area        1     93014  7403624 26800
## - wden        1     98273  7408883 26803
## - sea         1    142278  7452888 26823
## - csor        1   2457340  9767950 27763
## - cres        1   4152157 11462767 28318
## 
## Step:  AIC=26757.11
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + paper + glass + wood + raee + texile + msw_so + 
##     sor + roads + s_wteregio + gdp + proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - roads       1      3601  7315549 26751
## - texile      1      4195  7316143 26751
## - s_wteregio  1      5378  7317326 26752
## - wood        1      6677  7318626 26752
## - msw_so      1      7425  7319374 26752
## - paper       1     10681  7322629 26754
## - raee        1     11190  7323138 26754
## - pop         1     13575  7325523 26755
## - sor         1     16175  7328123 26757
## <none>                     7311948 26757
## - istat       1     22598  7334546 26760
## - wage        1     25400  7337348 26761
## - glass       1     34183  7346131 26765
## - organic     1     39156  7351104 26768
## - proads      1     40940  7352889 26768
## - pden        1     77708  7389656 26786
## - isle        1     80617  7392565 26787
## - gdp         1     90063  7402012 26792
## - area        1     96187  7408136 26794
## - wden        1    100786  7412734 26796
## - sea         1    141683  7453631 26816
## - csor        1   2456252  9768200 27754
## - cres        1   4161056 11473005 28313
## 
## Step:  AIC=26750.66
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + paper + glass + wood + raee + texile + msw_so + 
##     sor + s_wteregio + gdp + proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - texile      1      4121  7319670 26744
## - s_wteregio  1      5673  7321222 26745
## - msw_so      1      6755  7322304 26746
## - wood        1      6850  7322399 26746
## - paper       1     10621  7326170 26748
## - raee        1     11424  7326973 26748
## - pop         1     14088  7329637 26749
## - sor         1     15695  7331244 26750
## <none>                     7315549 26751
## - istat       1     23407  7338956 26754
## - wage        1     25530  7341080 26755
## - glass       1     33960  7349509 26759
## - organic     1     38018  7353567 26760
## - proads      1     38144  7353693 26761
## - pden        1     78073  7393622 26779
## - isle        1     80683  7396232 26781
## - gdp         1     86951  7402500 26784
## - area        1     92780  7408329 26786
## - wden        1    101202  7416751 26790
## - sea         1    140419  7455968 26808
## - csor        1   2460459  9776008 27749
## - cres        1   4177403 11492952 28311
## 
## Step:  AIC=26744.47
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + paper + glass + wood + raee + msw_so + sor + s_wteregio + 
##     gdp + proads + wage
## 
##              Df Sum of Sq      RSS   AIC
## - s_wteregio  1      5149  7324819 26739
## - wood        1      6483  7326153 26739
## - msw_so      1      6936  7326606 26740
## - paper       1     10838  7330508 26742
## - raee        1     11188  7330858 26742
## - sor         1     13962  7333632 26743
## - pop         1     14314  7333984 26743
## <none>                     7319670 26744
## - istat       1     23416  7343086 26747
## - wage        1     24886  7344556 26748
## - glass       1     32842  7352512 26752
## - organic     1     37538  7357208 26754
## - proads      1     38448  7358118 26754
## - pden        1     79355  7399025 26774
## - isle        1     80480  7400150 26774
## - gdp         1     85353  7405023 26777
## - area        1     92527  7412197 26780
## - wden        1    102441  7422111 26785
## - sea         1    143469  7463139 26804
## - csor        1   2471975  9791645 27746
## - cres        1   4195992 11515662 28310
## 
## Step:  AIC=26738.75
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + paper + glass + wood + raee + msw_so + sor + gdp + 
##     proads + wage
## 
##           Df Sum of Sq      RSS   AIC
## - msw_so   1      6372  7331191 26734
## - paper    1      7989  7332808 26734
## - wood     1      8498  7333317 26735
## - raee     1     12255  7337074 26736
## - sor      1     12964  7337783 26737
## - pop      1     13635  7338454 26737
## <none>                  7324819 26739
## - istat    1     21255  7346074 26741
## - wage     1     31211  7356029 26745
## - glass    1     34555  7359374 26747
## - organic  1     36785  7361603 26748
## - proads   1     40150  7364969 26750
## - pden     1     78912  7403731 26768
## - isle     1     79126  7403945 26768
## - gdp      1     87513  7412332 26772
## - area     1    100428  7425247 26778
## - wden     1    100856  7425675 26778
## - sea      1    172129  7496948 26811
## - csor     1   2516217  9841036 27756
## - cres     1   4234724 11559543 28315
## 
## Step:  AIC=26733.62
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + paper + glass + wood + raee + sor + gdp + proads + 
##     wage
## 
##           Df Sum of Sq      RSS   AIC
## - paper    1      7905  7339096 26729
## - wood     1      8333  7339524 26729
## - raee     1     11312  7342503 26731
## - sor      1     16976  7348168 26734
## <none>                  7331191 26734
## - pop      1     19265  7350456 26735
## - istat    1     21722  7352913 26736
## - wage     1     31249  7362440 26740
## - glass    1     34926  7366117 26742
## - proads   1     39714  7370905 26744
## - organic  1     42319  7373510 26746
## - isle     1     79049  7410240 26763
## - gdp      1     87578  7418769 26767
## - pden     1     90444  7421635 26768
## - area     1    105528  7436719 26775
## - wden     1    112908  7444100 26778
## - sea      1    169201  7500392 26805
## - csor     1   2521263  9852455 27752
## - cres     1   4247656 11578847 28312
## 
## Step:  AIC=26729.21
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + glass + wood + raee + sor + gdp + proads + wage
## 
##           Df Sum of Sq      RSS   AIC
## - wood     1      7232  7346328 26724
## - sor      1     10230  7349326 26726
## - raee     1     11736  7350833 26727
## <none>                  7339096 26729
## - pop      1     20051  7359148 26730
## - istat    1     21515  7360611 26731
## - wage     1     33012  7372108 26737
## - glass    1     33231  7372327 26737
## - organic  1     36138  7375234 26738
## - proads   1     39697  7378793 26740
## - isle     1     79633  7418729 26758
## - pden     1     88531  7427627 26763
## - gdp      1     90272  7429368 26764
## - area     1    104054  7443150 26770
## - wden     1    111658  7450754 26774
## - sea      1    169403  7508499 26800
## - csor     1   2563792  9902888 27761
## - cres     1   4273723 11612819 28314
## 
## Step:  AIC=26724.48
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + glass + raee + sor + gdp + proads + wage
## 
##           Df Sum of Sq      RSS   AIC
## - sor      1      6274  7352602 26719
## - raee     1      9353  7355681 26721
## - istat    1     16185  7362513 26724
## <none>                  7346328 26724
## - pop      1     19244  7365572 26725
## - glass    1     27833  7374161 26730
## - organic  1     31826  7378154 26731
## - wage     1     36644  7382972 26734
## - proads   1     43714  7390042 26737
## - isle     1     79369  7425697 26754
## - pden     1     84033  7430361 26756
## - gdp      1     98082  7444410 26762
## - wden     1    106728  7453056 26766
## - area     1    106926  7453254 26766
## - sea      1    179681  7526009 26800
## - csor     1   2573312  9919640 27759
## - cres     1   4285415 11631744 28312
## 
## Step:  AIC=26719.29
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + glass + raee + gdp + proads + wage
## 
##           Df Sum of Sq      RSS   AIC
## - raee     1     13700  7366302 26718
## <none>                  7352602 26719
## - istat    1     19298  7371900 26720
## - pop      1     21345  7373947 26721
## - glass    1     22834  7375436 26722
## - organic  1     25762  7378364 26723
## - wage     1     33608  7386210 26727
## - proads   1     38026  7390628 26729
## - isle     1     79252  7431854 26748
## - pden     1     89746  7442348 26753
## - gdp      1     91956  7444558 26754
## - area     1    114020  7466622 26765
## - wden     1    115424  7468026 26765
## - sea      1    178297  7530899 26794
## - csor     1   2728190 10080792 27807
## - cres     1   4622599 11975201 28405
## 
## Step:  AIC=26717.6
## tc ~ cres + csor + istat + area + pop + isle + sea + pden + wden + 
##     organic + glass + gdp + proads + wage
## 
##           Df Sum of Sq      RSS   AIC
## <none>                  7366302 26718
## - glass    1     20830  7387132 26719
## - pop      1     22076  7388378 26720
## - organic  1     23585  7389887 26720
## - istat    1     25238  7391540 26721
## - wage     1     29337  7395639 26723
## - proads   1     39790  7406092 26728
## - isle     1     86137  7452439 26750
## - gdp      1     86385  7452688 26750
## - pden     1     88293  7454596 26751
## - wden     1    113035  7479338 26762
## - area     1    119260  7485562 26765
## - sea      1    175306  7541608 26791
## - csor     1   2741206 10107508 27808
## - cres     1   4613246 11979548 28398
# Inspect the model selected by backward stepwise elimination: 14 of the
# original predictors survive, every coefficient is significant at the 1%
# level, and the model explains ~63% of the variance in total cost (tc).
summary(lm_bwd)
## 
## Call:
## lm(formula = tc ~ cres + csor + istat + area + pop + isle + sea + 
##     pden + wden + organic + glass + gdp + proads + wage, data = train_data[, 
##     4:(ncol(train_data) - 1)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -822.04  -24.76   -7.73   15.76  656.25 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.473e+01  4.847e+00  13.354  < 2e-16 ***
## cres         9.391e-01  2.018e-02  46.529  < 2e-16 ***
## csor         8.918e-01  2.486e-02  35.867  < 2e-16 ***
## istat       -9.586e-05  2.785e-05  -3.442 0.000585 ***
## area         1.293e-01  1.728e-02   7.481 9.29e-14 ***
## pop         -5.482e-05  1.703e-05  -3.219 0.001300 ** 
## isle         6.847e+01  1.077e+01   6.358 2.31e-10 ***
## sea          2.097e+01  2.312e+00   9.070  < 2e-16 ***
## pden        -3.413e-02  5.303e-03  -6.437 1.39e-10 ***
## wden         8.411e-05  1.155e-05   7.283 4.01e-13 ***
## organic     -2.103e-01  6.322e-02  -3.327 0.000887 ***
## glass       -6.794e-01  2.173e-01  -3.127 0.001783 ** 
## gdp          4.356e+00  6.841e-01   6.367 2.18e-10 ***
## proads      -3.048e+00  7.054e-01  -4.321 1.60e-05 ***
## wage        -2.261e+00  6.092e-01  -3.711 0.000210 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.16 on 3457 degrees of freedom
## Multiple R-squared:  0.6293, Adjusted R-squared:  0.6278 
## F-statistic: 419.2 on 14 and 3457 DF,  p-value: < 2.2e-16
# Compare the predictor sets of the two models: the full model keeps all
# 35 terms (note `fee` was expanded to the dummy `feeSTANDARD`), while the
# stepwise model retains only 14 of them.
names(coef(lm_full))
##  [1] "(Intercept)" "cres"        "csor"        "istat"       "area"       
##  [6] "pop"         "alt"         "isle"        "sea"         "pden"       
## [11] "wden"        "urb"         "feeSTANDARD" "d_fee"       "sample"     
## [16] "organic"     "paper"       "glass"       "wood"        "metal"      
## [21] "plastic"     "raee"        "texile"      "other"       "msw_so"     
## [26] "msw_un"      "msw"         "sor"         "geo"         "roads"      
## [31] "s_wteregio"  "s_landfill"  "gdp"         "proads"      "wage"       
## [36] "finance"
names(coef(lm_bwd))
##  [1] "(Intercept)" "cres"        "csor"        "istat"       "area"       
##  [6] "pop"         "isle"        "sea"         "pden"        "wden"       
## [11] "organic"     "glass"       "gdp"         "proads"      "wage"
# Score both linear models on the held-out test set.
lm_full_pred <- predict(lm_full, newdata=test_data)
lm_bwd_pred <- predict(lm_bwd, newdata=test_data)
# forecast::accuracy() computes ME, RMSE, MAE, MPE and MAPE in one call.
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Test-set accuracy of the full (all-predictor) model.
print(accuracy(lm_full_pred, test_data$tc))
##                   ME     RMSE     MAE       MPE     MAPE
## Test set -0.03499095 41.92483 27.7932 -6.059496 18.49293

Visual comparison

# Assemble predicted vs. actual total cost for the stepwise model.
plot_dat <- data.frame(predicted = lm_bwd_pred, actual = test_data$tc)

# Scatter predicted against actual on a common scale: the dashed 45-degree
# line marks perfect prediction; the loess smooth reveals systematic bias.
ggplot(plot_dat, aes(x = predicted, y = actual)) +
  geom_point() +
  geom_smooth() +
  xlim(-100, 850) +
  ylim(-100, 850) +
  geom_abline(slope = 1, linetype = 2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Test-set accuracy of the reduced model: RMSE/MAE are essentially
# identical to the full model's despite using 14 predictors instead of 35.
print(accuracy(lm_bwd_pred, test_data$tc))
##                  ME     RMSE      MAE       MPE    MAPE
## Test set -0.1313893 41.92336 27.80371 -6.111286 18.4673

Decision Trees

#install.packages("ggplot2")
#install.packages("rpart")                      # Popular decision tree algorithm
#install.packages("rattle")                 # Fancy tree plot
#install.packages("rpart.plot")             # Enhanced tree plots
#install.packages("RColorBrewer")               # Color selection for fancy tree plot
#install.packages("party")                  # Alternative decision tree algorithm
#install.packages("partykit")               # Convert rpart object to BinaryTree
#install.packages("caret")      
# install.packages("splitstackshape")
library(ggplot2)
library(rpart)                      # Popular decision tree algorithm
library(rattle)                 # Fancy tree plot
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)             # Enhanced tree plots
library(RColorBrewer)               # Color selection for fancy tree plot
library(party)                  # Alternative decision tree algorithm
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library(partykit)               # Convert rpart object to BinaryTree
## Loading required package: libcoin
## 
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
##     node_terminal, varimp
library(caret)  
## Loading required package: lattice
library(reshape2) # Load reshape 2 for melting
library(DMwR) # Load data mining with R for SMOTE
library(splitstackshape) # Used for stratified sampling
# Fit a regression tree predicting total cost (tc) from all remaining
# predictors. Columns 1-3 (presumably the region/province/name identifier
# columns — confirm against train_data) and the final column are excluded.
tree_model <- rpart(
  tc ~ .,
  data = train_data[, 4:(ncol(train_data) - 1)],
  method = "anova" # "anova" fits a regression tree for a continuous target
)

# Draw the fitted tree with fitted values shown in the leaves
rpart.plot(tree_model,
           type = 3,
           digits = 3,
           fallen.leaves = TRUE,
           cex = 0.7)

Interpretation

Key Components of the Tree: Nodes:

Each internal node (rectangular split) represents a decision based on a specific feature (e.g., cres, csor, pop). The leaf nodes (rounded at the bottom) represent the final predictions or outputs. Splits:

Each split is a condition on a feature. For example: cres < 124 is the first split, dividing the data based on whether this condition is true or false. These splits help isolate subsets of data with similar characteristics. Values in Leaves:

The numbers in each leaf represent the predicted values or proportions for the dependent variable. For example: 103 (20.1%) means this leaf has a predicted value of 103, and 20.1% of the data falls into this category.

The feature at the top (cres) is the most important for splitting the data, as it’s used in the root node. Other features like csor and pop are less influential but still important for refining predictions.

fancyRpartPlot(tree_model) # Plot fancy tree

# Print the tree structure: node), split, n, deviance, fitted value
tree_model
## n= 3472 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3472 19871340.0 154.1717  
##    2) cres< 124.31 3255 10130030.0 144.2435  
##      4) csor< 91.035 2975  6885656.0 137.0712  
##        8) cres< 41.905 1640  2431924.0 119.5457  
##         16) csor< 44.925 697   756329.6 103.2627 *
##         17) csor>=44.925 943  1354202.0 131.5810 *
##        9) cres>=41.905 1335  3331219.0 158.6007  
##         18) csor< 55.985 931  1992266.0 147.3217 *
##         19) csor>=55.985 404   947579.5 184.5926  
##           38) cres< 82.83 331   494683.0 173.0153 *
##           39) cres>=82.83 73   207365.9 237.0873 *
##      5) csor>=91.035 280  1465313.0 220.4486  
##       10) cres< 69.665 217   480695.8 195.6440 *
##       11) cres>=69.665 63   391227.5 305.8865 *
##    3) cres>=124.31 217  4607846.0 303.0945  
##      6) csor< 141.24 199  3051299.0 281.5866  
##       12) cres< 178.465 141  1036042.0 243.6839  
##         24) csor< 38.045 91   242300.8 211.2895 *
##         25) csor>=38.045 50   524444.3 302.6418 *
##       13) cres>=178.465 58  1320256.0 373.7295  
##         26) pop>=305 49   463768.8 338.7927 *
##         27) pop< 305 9   471054.1 563.9411 *
##      7) csor>=141.24 18   446775.2 540.8756 *
# Detailed summary: CP table, variable importance, and per-node split candidates
summary(tree_model)
## Call:
## rpart(formula = tc ~ ., data = train_data[, 4:(ncol(train_data) - 
##     1)], method = "anova")
##   n= 3472 
## 
##            CP nsplit rel error    xerror       xstd
## 1  0.25833528      0 1.0000000 1.0009003 0.07612054
## 2  0.08952885      1 0.7416647 0.7620112 0.04984418
## 3  0.05648907      2 0.6521359 0.6836191 0.04732935
## 4  0.05584786      3 0.5956468 0.6469274 0.04461505
## 5  0.03497502      4 0.5397989 0.6001122 0.04385275
## 6  0.02986160      5 0.5048239 0.6014537 0.04607272
## 7  0.01969533      6 0.4749623 0.5626851 0.04473741
## 8  0.01939641      7 0.4552670 0.5174028 0.03772581
## 9  0.01617368      8 0.4358706 0.5056735 0.03762118
## 10 0.01355203      9 0.4196969 0.4979615 0.03751073
## 11 0.01235602     10 0.4061449 0.4860922 0.03653164
## 12 0.01000000     11 0.3937889 0.4770283 0.03539802
## 
## Variable importance
##    cres    csor     sor     pop finance     gdp   metal  msw_un organic     msw 
##      49      25       4       3       2       2       2       2       2       2 
##   paper     geo    pden     alt 
##       2       2       1       1 
## 
## Node number 1: 3472 observations,    complexity param=0.2583353
##   mean=154.1717, MSE=5723.313 
##   left son=2 (3255 obs) right son=3 (217 obs)
##   Primary splits:
##       cres    < 124.31   to the left,  improve=0.25833530, (0 missing)
##       csor    < 112.795  to the left,  improve=0.15195620, (0 missing)
##       sea     < 0.5      to the left,  improve=0.12044290, (0 missing)
##       gdp     < 7.325502 to the left,  improve=0.09884944, (0 missing)
##       finance < 7.545267 to the left,  improve=0.09884944, (0 missing)
##   Surrogate splits:
##       csor   < 205      to the left,  agree=0.939, adj=0.023, (0 split)
##       msw_so < 8552.5   to the right, agree=0.938, adj=0.009, (0 split)
##       pop    < 72.5     to the right, agree=0.938, adj=0.005, (0 split)
##       sor    < 0.71     to the right, agree=0.938, adj=0.005, (0 split)
## 
## Node number 2: 3255 observations,    complexity param=0.08952885
##   mean=144.2435, MSE=3112.144 
##   left son=4 (2975 obs) right son=5 (280 obs)
##   Primary splits:
##       csor       < 91.035   to the left,  improve=0.17562230, (0 missing)
##       cres       < 48.795   to the left,  improve=0.16756350, (0 missing)
##       sea        < 0.5      to the left,  improve=0.11856350, (0 missing)
##       s_landfill < 13.27579 to the left,  improve=0.09471711, (0 missing)
##       geo        < 2.5      to the right, improve=0.09379906, (0 missing)
##   Surrogate splits:
##       istat   < 111099.5 to the left,  agree=0.915, adj=0.007, (0 split)
##       paper   < 40.16635 to the left,  agree=0.915, adj=0.007, (0 split)
##       gdp     < 9.076158 to the left,  agree=0.914, adj=0.004, (0 split)
##       finance < 9.348442 to the left,  agree=0.914, adj=0.004, (0 split)
## 
## Node number 3: 217 observations,    complexity param=0.05584786
##   mean=303.0945, MSE=21234.31 
##   left son=6 (199 obs) right son=7 (18 obs)
##   Primary splits:
##       csor    < 141.24   to the left,  improve=0.2408440, (0 missing)
##       cres    < 172.045  to the left,  improve=0.2118968, (0 missing)
##       gdp     < 8.290743 to the left,  improve=0.1875395, (0 missing)
##       finance < 8.539466 to the left,  improve=0.1875395, (0 missing)
##       alt     < 7.5      to the right, improve=0.1209677, (0 missing)
## 
## Node number 4: 2975 observations,    complexity param=0.05648907
##   mean=137.0712, MSE=2314.506 
##   left son=8 (1640 obs) right son=9 (1335 obs)
##   Primary splits:
##       cres       < 41.905   to the left,  improve=0.16302200, (0 missing)
##       geo        < 2.5      to the right, improve=0.11420950, (0 missing)
##       istat      < 31024.5  to the left,  improve=0.10912740, (0 missing)
##       s_landfill < 13.27579 to the left,  improve=0.10504870, (0 missing)
##       msw_un     < 1533840  to the left,  improve=0.09428845, (0 missing)
##   Surrogate splits:
##       sor     < 66.735   to the right, agree=0.748, adj=0.439, (0 split)
##       metal   < 1.235346 to the right, agree=0.665, adj=0.254, (0 split)
##       organic < 20.68348 to the right, agree=0.657, adj=0.236, (0 split)
##       geo     < 2.5      to the right, agree=0.651, adj=0.223, (0 split)
##       paper   < 9.066684 to the right, agree=0.648, adj=0.216, (0 split)
## 
## Node number 5: 280 observations,    complexity param=0.0298616
##   mean=220.4486, MSE=5233.262 
##   left son=10 (217 obs) right son=11 (63 obs)
##   Primary splits:
##       cres  < 69.665   to the left,  improve=0.40495780, (0 missing)
##       csor  < 145.89   to the left,  improve=0.15949150, (0 missing)
##       sea   < 0.5      to the left,  improve=0.13084540, (0 missing)
##       alt   < 15.5     to the right, improve=0.10410440, (0 missing)
##       istat < 22520    to the right, improve=0.08625579, (0 missing)
##   Surrogate splits:
##       alt  < 4.5      to the right, agree=0.789, adj=0.063, (0 split)
##       pden < 11.1302  to the right, agree=0.789, adj=0.063, (0 split)
##       sor  < 42.125   to the right, agree=0.789, adj=0.063, (0 split)
##       csor < 160.715  to the left,  agree=0.786, adj=0.048, (0 split)
##       area < 2.715    to the right, agree=0.786, adj=0.048, (0 split)
## 
## Node number 6: 199 observations,    complexity param=0.03497502
##   mean=281.5866, MSE=15333.16 
##   left son=12 (141 obs) right son=13 (58 obs)
##   Primary splits:
##       cres    < 178.465  to the left,  improve=0.2277721, (0 missing)
##       gdp     < 8.290991 to the left,  improve=0.2274784, (0 missing)
##       finance < 8.539721 to the left,  improve=0.2274784, (0 missing)
##       csor    < 37.905   to the left,  improve=0.1698224, (0 missing)
##       pop     < 220.5    to the right, improve=0.1060024, (0 missing)
##   Surrogate splits:
##       sor     < 23.885   to the right, agree=0.734, adj=0.086, (0 split)
##       gdp     < 8.502975 to the left,  agree=0.734, adj=0.086, (0 split)
##       finance < 8.758064 to the left,  agree=0.734, adj=0.086, (0 split)
##       isle    < 0.5      to the left,  agree=0.729, adj=0.069, (0 split)
##       glass   < 1.93376  to the right, agree=0.729, adj=0.069, (0 split)
## 
## Node number 7: 18 observations
##   mean=540.8756, MSE=24820.84 
## 
## Node number 8: 1640 observations,    complexity param=0.01617368
##   mean=119.5457, MSE=1482.881 
##   left son=16 (697 obs) right son=17 (943 obs)
##   Primary splits:
##       csor       < 44.925   to the left,  improve=0.13215570, (0 missing)
##       s_landfill < 13.27579 to the left,  improve=0.12435610, (0 missing)
##       istat      < 31514    to the left,  improve=0.08575976, (0 missing)
##       geo        < 2.5      to the right, improve=0.08087859, (0 missing)
##       msw_un     < 802895   to the left,  improve=0.06796906, (0 missing)
##   Surrogate splits:
##       plastic < 3.762071 to the left,  agree=0.618, adj=0.102, (0 split)
##       sor     < 60.725   to the left,  agree=0.599, adj=0.057, (0 split)
##       istat   < 24564    to the left,  agree=0.599, adj=0.056, (0 split)
##       alt     < 616.5    to the right, agree=0.599, adj=0.056, (0 split)
##       metal   < 3.097131 to the right, agree=0.596, adj=0.050, (0 split)
## 
## Node number 9: 1335 observations,    complexity param=0.01969533
##   mean=158.6007, MSE=2495.295 
##   left son=18 (931 obs) right son=19 (404 obs)
##   Primary splits:
##       csor   < 55.985   to the left,  improve=0.11748630, (0 missing)
##       cres   < 65.84    to the left,  improve=0.08717904, (0 missing)
##       sea    < 0.5      to the left,  improve=0.08667848, (0 missing)
##       msw_un < 1536435  to the left,  improve=0.08513964, (0 missing)
##       area   < 80.545   to the left,  improve=0.06546742, (0 missing)
##   Surrogate splits:
##       wood   < 8.147758 to the left,  agree=0.706, adj=0.027, (0 split)
##       msw_so < 25241050 to the left,  agree=0.705, adj=0.025, (0 split)
##       msw_un < 4181055  to the left,  agree=0.704, adj=0.022, (0 split)
##       msw    < 17460710 to the left,  agree=0.703, adj=0.020, (0 split)
##       pop    < 41309.5  to the left,  agree=0.703, adj=0.017, (0 split)
## 
## Node number 10: 217 observations
##   mean=195.644, MSE=2215.188 
## 
## Node number 11: 63 observations
##   mean=305.8865, MSE=6209.96 
## 
## Node number 12: 141 observations,    complexity param=0.01355203
##   mean=243.6839, MSE=7347.817 
##   left son=24 (91 obs) right son=25 (50 obs)
##   Primary splits:
##       csor    < 38.045   to the left,  improve=0.25992870, (0 missing)
##       alt     < 43       to the right, improve=0.17914240, (0 missing)
##       gdp     < 7.857229 to the left,  improve=0.11904880, (0 missing)
##       finance < 8.092946 to the left,  improve=0.11904880, (0 missing)
##       wage    < 9.405493 to the left,  improve=0.09072908, (0 missing)
##   Surrogate splits:
##       alt     < 15.5     to the right, agree=0.716, adj=0.20, (0 split)
##       wage    < 9.517115 to the left,  agree=0.709, adj=0.18, (0 split)
##       istat   < 16223    to the right, agree=0.681, adj=0.10, (0 split)
##       glass   < 11.00022 to the left,  agree=0.674, adj=0.08, (0 split)
##       plastic < 10.80984 to the left,  agree=0.674, adj=0.08, (0 split)
## 
## Node number 13: 58 observations,    complexity param=0.01939641
##   mean=373.7295, MSE=22763.03 
##   left son=26 (49 obs) right son=27 (9 obs)
##   Primary splits:
##       pop     < 305      to the right, improve=0.2919379, (0 missing)
##       gdp     < 8.306887 to the left,  improve=0.2779149, (0 missing)
##       finance < 8.556094 to the left,  improve=0.2779149, (0 missing)
##       msw_un  < 119585   to the right, improve=0.2313858, (0 missing)
##       pden    < 13.36629 to the right, improve=0.1955234, (0 missing)
##   Surrogate splits:
##       msw_un  < 133180   to the right, agree=0.948, adj=0.667, (0 split)
##       msw     < 195999.5 to the right, agree=0.948, adj=0.667, (0 split)
##       gdp     < 8.614421 to the left,  agree=0.948, adj=0.667, (0 split)
##       finance < 8.872854 to the left,  agree=0.948, adj=0.667, (0 split)
##       pden    < 10.27217 to the right, agree=0.914, adj=0.444, (0 split)
## 
## Node number 16: 697 observations
##   mean=103.2627, MSE=1085.121 
## 
## Node number 17: 943 observations
##   mean=131.581, MSE=1436.057 
## 
## Node number 18: 931 observations
##   mean=147.3217, MSE=2139.921 
## 
## Node number 19: 404 observations,    complexity param=0.01235602
##   mean=184.5926, MSE=2345.494 
##   left son=38 (331 obs) right son=39 (73 obs)
##   Primary splits:
##       cres    < 82.83    to the left,  improve=0.25911350, (0 missing)
##       sea     < 0.5      to the left,  improve=0.12954860, (0 missing)
##       msw_un  < 520535   to the left,  improve=0.09093708, (0 missing)
##       sor     < 50.385   to the right, improve=0.07881458, (0 missing)
##       organic < 18.56773 to the right, improve=0.07301852, (0 missing)
##   Surrogate splits:
##       plastic < 1.673275 to the right, agree=0.832, adj=0.068, (0 split)
##       sor     < 45.985   to the right, agree=0.832, adj=0.068, (0 split)
##       istat   < 9020     to the right, agree=0.827, adj=0.041, (0 split)
##       isle    < 0.5      to the left,  agree=0.827, adj=0.041, (0 split)
##       msw_so  < 32852    to the right, agree=0.827, adj=0.041, (0 split)
## 
## Node number 24: 91 observations
##   mean=211.2895, MSE=2662.646 
## 
## Node number 25: 50 observations
##   mean=302.6418, MSE=10488.89 
## 
## Node number 26: 49 observations
##   mean=338.7927, MSE=9464.67 
## 
## Node number 27: 9 observations
##   mean=563.9411, MSE=52339.34 
## 
## Node number 38: 331 observations
##   mean=173.0153, MSE=1494.51 
## 
## Node number 39: 73 observations
##   mean=237.0873, MSE=2840.628
# Display the complexity-parameter (cp) table with cross-validated error (xerror)
printcp(tree_model)
## 
## Regression tree:
## rpart(formula = tc ~ ., data = train_data[, 4:(ncol(train_data) - 
##     1)], method = "anova")
## 
## Variables actually used in tree construction:
## [1] cres csor pop 
## 
## Root node error: 19871343/3472 = 5723.3
## 
## n= 3472 
## 
##          CP nsplit rel error  xerror     xstd
## 1  0.258335      0   1.00000 1.00090 0.076121
## 2  0.089529      1   0.74166 0.76201 0.049844
## 3  0.056489      2   0.65214 0.68362 0.047329
## 4  0.055848      3   0.59565 0.64693 0.044615
## 5  0.034975      4   0.53980 0.60011 0.043853
## 6  0.029862      5   0.50482 0.60145 0.046073
## 7  0.019695      6   0.47496 0.56269 0.044737
## 8  0.019396      7   0.45527 0.51740 0.037726
## 9  0.016174      8   0.43587 0.50567 0.037621
## 10 0.013552      9   0.41970 0.49796 0.037511
## 11 0.012356     10   0.40614 0.48609 0.036532
## 12 0.010000     11   0.39379 0.47703 0.035398
# Plot cross-validated error against cp to choose a pruning threshold
plotcp(tree_model) 

# Refit the tree with a larger complexity-parameter cutoff (cp = 0.022),
# which prunes away the weaker splits of the first model.
tree_model_2 <- rpart(
  tc ~ .,
  data = train_data[, 4:(ncol(train_data) - 1)],
  control = rpart.control(cp = 0.022)
)

# Visualize the pruned tree
fancyRpartPlot(tree_model_2)

Random Forest & XGBoost

#install.packages("randomForest")
#install.packages("caret")
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
## 
##     xgboost

Best Random Forest

# Fit a random forest for total cost (tc); identifier and final columns dropped
rf_mod <- randomForest(tc ~., # Set tree formula
                         data = train_data[, 4:(ncol(train_data) - 1)], # Set dataset
                         ntree = 200, # Number of trees to grow
                         nodesize = 1, # Minimum size of terminal nodes
                         mtry = 12) # Number of variables sampled as candidates at each split
rf_preds <- predict(rf_mod, test_data) # Create predictions for random forest model
# Assemble predicted vs actual values for plotting
plot_dat <- cbind.data.frame(rf_preds, test_data$tc)
names(plot_dat) <- c("predicted", "actual")

# Scatter predicted against actual; dashed 45-degree line marks perfect fit
ggplot(plot_dat, aes ( x = predicted, y  = actual)) +
  geom_point() +
  geom_smooth() +
  xlim(-100, 850) +
  ylim(-100, 850) +
  geom_abline(slope = 1, linetype = 2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Interpretation

When we compare the random forest predictions (rf_preds) with the backward-selected linear model predictions (lm_bwd_pred), we can see that the random forest predictions lie even closer to the actual values.

Apply an XGBoost model to the Waste dataset

library(fastDummies)

# One-hot encode the character columns (e.g. fee) of the predictor set.
# Columns 1-4 (region/province/name identifiers and the target tc) and the
# final column are excluded — NOTE(review): confirm the 5:(ncol-1) range
# matches the 4:(ncol-1) predictor range used for the tree/forest models.
dummy_data <- dummy_cols(waste_data[, 5:(ncol(waste_data) - 1)], remove_selected_columns = TRUE)
# Split the encoded data with the same indices used for train/test elsewhere
train_dummy <- dummy_data[train_data_indices,]
test_dummy <- dummy_data[-train_data_indices,]
# Create training matrix with tc as the label
dtrain <- xgb.DMatrix(data = as.matrix(train_dummy), label = as.numeric(train_data$tc))
# Create test matrix
dtest <- xgb.DMatrix(data = as.matrix(test_dummy), label = as.numeric(test_data$tc))

XGBoost

# Fix the RNG so the boosted-model fit is reproducible
set.seed(111111)

# Train an XGBoost model with default parameters for 100 boosting rounds,
# printing the training fit every 20 iterations.
bst_1 <- xgboost(
  data = dtrain,          # Training data (xgb.DMatrix)
  nrounds = 100,          # Number of boosting rounds
  verbose = 1,            # 1 - prints out fit
  print_every_n = 20      # Prints out result every 20th iteration
)
## [1]  train-rmse:124.492289 
## [21] train-rmse:17.851580 
## [41] train-rmse:11.960609 
## [61] train-rmse:8.847052 
## [81] train-rmse:6.697660 
## [100]    train-rmse:5.242919
# Score the boosted model on the held-out test matrix and report accuracy
bst_preds <- predict(bst_1, dtest)
print(accuracy(bst_preds, test_data$tc))
##               ME     RMSE      MAE       MPE     MAPE
## Test set 1.30633 41.37576 24.23526 -3.476268 15.34897

MAE has gone down

# Assemble predicted vs actual total cost for the XGBoost model
plot_dat <- data.frame(predicted = bst_preds, actual = test_data$tc)

# Scatter predicted against actual; the dashed 45-degree line marks perfect fit
ggplot(plot_dat, aes(x = predicted, y = actual)) +
  geom_point() +
  geom_smooth() +
  xlim(-100, 850) +
  ylim(-100, 850) +
  geom_abline(slope = 1, linetype = 2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Interpretation: Even better prediction.

Variable Importance

# Extract feature importance from the fitted booster
imp_mat <- xgb.importance(model = bst_1)
# Plot importance (top 10 variables)
xgb.plot.importance(imp_mat, top_n = 10)

Interpretation: cres and csor have the highest importance scores, indicating that these features contribute the most to the total cost predictions. These variables likely have strong predictive power or influence over the target variable.

# Load helper functions for SHAP score computation and plotting.
# NOTE(review): hard-coded user path — consider a project-relative path.
source("~/Downloads/a_insights_shap_functions.r")

SHAP

# Calculate SHAP importance
# Compute SHAP-based feature importance for the fitted booster.
# Use FALSE (not the reassignable shorthand F) for shap_approx:
# exact SHAP contributions rather than the approximation.
shap_result <- shap.score.rank(xgb_model = bst_1,
                X_train = as.matrix(train_dummy),
                shap_approx = FALSE)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:zoo':
## 
##     yearmon, yearqtr
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## make SHAP score by decreasing order
# Reshape SHAP scores to long format for the top 10 features
# (use <- rather than = for top-level assignment)
shap_long <- shap.prep(shap = shap_result,
                           X_train = as.matrix(train_dummy), 
                           top_n = 10)
## Loading required package: ggforce
# Summary plot: per-feature SHAP value distributions
plot.shap.summary(data_long = shap_long)

Interpretation: cres has the highest average SHAP value, indicating it is the most influential feature in the model’s predictions. Higher values of cres tend to lead to higher predicted cost.

max_depth_vals <- c(3, 5, 7, 10, 15) # Create vector of max depth values
min_child_weight <- c(1, 3, 5, 7, 10, 15) # Create vector of min child values

# Expand grid of parameter values
cv_params <- expand.grid(max_depth_vals, min_child_weight)
names(cv_params) <- c("max_depth", "min_child_weight")
# Preallocate results vector: one test RMSE per parameter combination
rmse_vec <- rep(NA, nrow(cv_params))
# Cross-validate each max_depth / min_child_weight combination.
# seq_len() is safe if the grid were ever empty (1:nrow would yield c(1, 0)).
for (i in seq_len(nrow(cv_params))) {
  set.seed(111111) # Same seed per fit so fold splits are comparable
  bst_tune <- xgb.cv(data = dtrain, # Set training data

                     nfold = 5, # Use 5 fold cross-validation

                     eta = 0.1, # Set learning rate
                     max_depth = cv_params$max_depth[i], # Set max depth (underscore name, consistent with min_child_weight)
                     min_child_weight = cv_params$min_child_weight[i], # Set minimum number of samples in node to split

                     nrounds = 400, # Set number of rounds
                     early_stopping_rounds = 20, # Set number of rounds to stop at if there is no improvement

                     verbose = 1, # 1 - Prints out fit
                     nthread = 1, # Set number of parallel threads
                     print_every_n = 20 # Prints out result every 20th iteration

  )

  # Record the mean CV test RMSE at the best (early-stopped) iteration.
  # best_iteration is the documented field; best_ntreelimit is deprecated.
  rmse_vec[i] <- bst_tune$evaluation_log$test_rmse_mean[bst_tune$best_iteration]
}
## [1]  train-rmse:155.923219+0.995006  test-rmse:156.025261+4.444763 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:45.093593+0.530261   test-rmse:50.106003+5.138494 
## [41] train-rmse:35.235581+0.556793   test-rmse:43.258349+4.899167 
## [61] train-rmse:32.371356+0.653130   test-rmse:41.639124+4.396476 
## [81] train-rmse:30.484983+0.554332   test-rmse:40.996523+4.631632 
## [101]    train-rmse:28.926941+0.569101   test-rmse:40.495324+4.641312 
## [121]    train-rmse:27.633749+0.495846   test-rmse:40.267839+4.679667 
## [141]    train-rmse:26.471459+0.504642   test-rmse:40.001346+4.595750 
## [161]    train-rmse:25.388633+0.477942   test-rmse:39.800069+4.645083 
## [181]    train-rmse:24.459427+0.461160   test-rmse:39.546817+4.538143 
## [201]    train-rmse:23.575082+0.445557   test-rmse:39.338964+4.456426 
## [221]    train-rmse:22.874367+0.437115   test-rmse:39.236728+4.476173 
## [241]    train-rmse:22.140840+0.375995   test-rmse:39.118551+4.497049 
## [261]    train-rmse:21.433876+0.350244   test-rmse:38.973601+4.543437 
## [281]    train-rmse:20.872481+0.371997   test-rmse:38.913844+4.534199 
## [301]    train-rmse:20.314922+0.355746   test-rmse:38.880581+4.580636 
## [321]    train-rmse:19.767130+0.307719   test-rmse:38.820683+4.627091 
## [341]    train-rmse:19.237818+0.321120   test-rmse:38.736026+4.651512 
## [361]    train-rmse:18.721295+0.280269   test-rmse:38.654334+4.644152 
## [381]    train-rmse:18.243752+0.288682   test-rmse:38.591976+4.649406 
## [400]    train-rmse:17.863479+0.247630   test-rmse:38.618163+4.675198 
## [1]  train-rmse:155.635524+0.990844  test-rmse:155.884116+4.459781 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:37.306239+0.359437   test-rmse:47.834205+6.275392 
## [41] train-rmse:25.628996+0.378135   test-rmse:41.570322+6.165153 
## [61] train-rmse:21.848797+0.726270   test-rmse:40.344041+6.261118 
## [81] train-rmse:19.547466+0.842682   test-rmse:39.870137+6.408156 
## [101]    train-rmse:17.610416+0.709637   test-rmse:39.640529+6.542050 
## [121]    train-rmse:16.077955+0.725726   test-rmse:39.437318+6.571280 
## [141]    train-rmse:14.808825+0.651605   test-rmse:39.294608+6.521086 
## [161]    train-rmse:13.628794+0.578398   test-rmse:39.172874+6.546034 
## [181]    train-rmse:12.582638+0.511526   test-rmse:39.105073+6.556987 
## [201]    train-rmse:11.711077+0.476856   test-rmse:39.092578+6.584332 
## [221]    train-rmse:10.865240+0.435333   test-rmse:39.029026+6.573390 
## [241]    train-rmse:10.102985+0.420194   test-rmse:39.005012+6.583835 
## [261]    train-rmse:9.417695+0.386754    test-rmse:38.986637+6.555736 
## [281]    train-rmse:8.759138+0.333634    test-rmse:38.990313+6.560016 
## [301]    train-rmse:8.236156+0.329632    test-rmse:38.965035+6.564122 
## [321]    train-rmse:7.678067+0.284506    test-rmse:38.941227+6.561929 
## [341]    train-rmse:7.174577+0.263691    test-rmse:38.913617+6.572939 
## [361]    train-rmse:6.674457+0.243478    test-rmse:38.888334+6.561878 
## Stopping. Best iteration:
## [358]    train-rmse:6.742673+0.243088    test-rmse:38.887928+6.561904
## 
## [1]  train-rmse:155.540935+0.988894  test-rmse:155.821506+4.448783 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:32.089768+0.334097   test-rmse:47.733685+6.892087 
## [41] train-rmse:17.782253+0.390725   test-rmse:41.852103+6.991691 
## [61] train-rmse:13.925996+0.451093   test-rmse:40.981263+6.764888 
## [81] train-rmse:11.372034+0.352159   test-rmse:40.708358+6.692603 
## [101]    train-rmse:9.366618+0.311467    test-rmse:40.528106+6.652575 
## [121]    train-rmse:7.828062+0.282044    test-rmse:40.474379+6.633743 
## [141]    train-rmse:6.567936+0.169459    test-rmse:40.410489+6.623649 
## [161]    train-rmse:5.551714+0.175513    test-rmse:40.388019+6.624430 
## [181]    train-rmse:4.725067+0.182829    test-rmse:40.382295+6.618079 
## Stopping. Best iteration:
## [167]    train-rmse:5.289942+0.156780    test-rmse:40.373794+6.625056
## 
## [1]  train-rmse:155.513517+0.987498  test-rmse:155.866687+4.438155 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:27.751263+0.188949   test-rmse:47.846481+6.046332 
## [41] train-rmse:8.803718+0.157587    test-rmse:41.338966+5.158538 
## [61] train-rmse:5.274136+0.104530    test-rmse:40.788000+4.842424 
## [81] train-rmse:3.563999+0.174723    test-rmse:40.652491+4.785510 
## [101]    train-rmse:2.401036+0.198387    test-rmse:40.591708+4.769054 
## [121]    train-rmse:1.685683+0.176099    test-rmse:40.578015+4.750616 
## [141]    train-rmse:1.144809+0.105795    test-rmse:40.577573+4.753814 
## Stopping. Best iteration:
## [138]    train-rmse:1.212141+0.111338    test-rmse:40.575250+4.752949
## 
## [1]  train-rmse:155.513134+0.987639  test-rmse:155.869106+4.436871 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:26.662322+0.162039   test-rmse:48.713198+6.497648 
## [41] train-rmse:5.869685+0.088411    test-rmse:41.783150+5.638744 
## [61] train-rmse:1.631045+0.067167    test-rmse:41.371784+5.434476 
## [81] train-rmse:0.587690+0.035217    test-rmse:41.279417+5.357672 
## [101]    train-rmse:0.261959+0.013676    test-rmse:41.234058+5.315414 
## [121]    train-rmse:0.134977+0.015741    test-rmse:41.217986+5.303442 
## [141]    train-rmse:0.070989+0.008293    test-rmse:41.212585+5.299911 
## [161]    train-rmse:0.035268+0.004723    test-rmse:41.210424+5.298446 
## [181]    train-rmse:0.018618+0.002794    test-rmse:41.209721+5.297941 
## [201]    train-rmse:0.009102+0.001996    test-rmse:41.209539+5.297960 
## [221]    train-rmse:0.004379+0.001168    test-rmse:41.209473+5.298011 
## [241]    train-rmse:0.002085+0.000639    test-rmse:41.209428+5.297991 
## [261]    train-rmse:0.001098+0.000238    test-rmse:41.209408+5.297987 
## [281]    train-rmse:0.000901+0.000061    test-rmse:41.209402+5.297987 
## Stopping. Best iteration:
## [274]    train-rmse:0.000901+0.000061    test-rmse:41.209402+5.297987
## 
## [1]  train-rmse:155.923219+0.995006  test-rmse:156.025261+4.444763 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:45.261486+0.576828   test-rmse:50.279596+5.122574 
## [41] train-rmse:35.505271+0.690523   test-rmse:43.618239+5.115975 
## [61] train-rmse:32.511876+0.557619   test-rmse:42.128886+4.986490 
## [81] train-rmse:30.439803+0.452820   test-rmse:41.370000+5.277682 
## [101]    train-rmse:28.903413+0.369530   test-rmse:40.932430+5.472931 
## [121]    train-rmse:27.527842+0.394657   test-rmse:40.674812+5.508553 
## [141]    train-rmse:26.340412+0.449682   test-rmse:40.480507+5.625782 
## [161]    train-rmse:25.316333+0.436525   test-rmse:40.250515+5.600797 
## [181]    train-rmse:24.359816+0.376062   test-rmse:40.044755+5.694525 
## [201]    train-rmse:23.536687+0.338762   test-rmse:39.836554+5.739841 
## [221]    train-rmse:22.728821+0.345782   test-rmse:39.700271+5.824222 
## [241]    train-rmse:22.034300+0.374043   test-rmse:39.615204+5.876605 
## [261]    train-rmse:21.335760+0.342067   test-rmse:39.523368+5.953505 
## [281]    train-rmse:20.759094+0.371213   test-rmse:39.462717+5.926652 
## [301]    train-rmse:20.169969+0.361070   test-rmse:39.432883+5.988091 
## [321]    train-rmse:19.644173+0.335691   test-rmse:39.308383+5.980166 
## [341]    train-rmse:19.084196+0.316821   test-rmse:39.207546+6.023201 
## [361]    train-rmse:18.613470+0.327446   test-rmse:39.186557+6.017137 
## [381]    train-rmse:18.160355+0.308557   test-rmse:39.152158+6.012429 
## [400]    train-rmse:17.742841+0.320447   test-rmse:39.093390+6.038325 
## [1]  train-rmse:155.663935+0.989397  test-rmse:155.961315+4.434324 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:37.518236+0.234517   test-rmse:47.686368+5.941638 
## [41] train-rmse:25.897748+0.353815   test-rmse:41.491408+5.804376 
## [61] train-rmse:22.196835+0.411861   test-rmse:40.165419+5.687282 
## [81] train-rmse:19.733499+0.516305   test-rmse:39.769057+5.626148 
## [101]    train-rmse:17.941086+0.492227   test-rmse:39.505729+5.515807 
## [121]    train-rmse:16.376199+0.494484   test-rmse:39.308983+5.535572 
## [141]    train-rmse:14.959684+0.382131   test-rmse:39.104045+5.526289 
## [161]    train-rmse:13.729137+0.291206   test-rmse:38.988209+5.471128 
## [181]    train-rmse:12.600224+0.277000   test-rmse:38.930072+5.479241 
## [201]    train-rmse:11.633413+0.330056   test-rmse:38.891789+5.498065 
## [221]    train-rmse:10.838557+0.307510   test-rmse:38.866125+5.514231 
## [241]    train-rmse:10.092801+0.298413   test-rmse:38.851638+5.511194 
## [261]    train-rmse:9.402903+0.283144    test-rmse:38.812336+5.505448 
## Stopping. Best iteration:
## [256]    train-rmse:9.569201+0.309314    test-rmse:38.807201+5.493952
## 
## [1]  train-rmse:155.573313+0.991363  test-rmse:155.896028+4.448180 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:32.514414+0.368874   test-rmse:47.358893+6.308671 
## [41] train-rmse:17.839060+0.453235   test-rmse:41.641008+5.936443 
## [61] train-rmse:14.144815+0.432039   test-rmse:40.940547+5.804612 
## [81] train-rmse:11.572142+0.299441   test-rmse:40.704365+5.735263 
## [101]    train-rmse:9.664845+0.087070    test-rmse:40.570333+5.796825 
## [121]    train-rmse:8.228081+0.154941    test-rmse:40.476992+5.795857 
## [141]    train-rmse:7.119070+0.112683    test-rmse:40.450593+5.813408 
## [161]    train-rmse:6.166064+0.103957    test-rmse:40.429551+5.839250 
## [181]    train-rmse:5.327917+0.114596    test-rmse:40.416814+5.839338 
## [201]    train-rmse:4.500785+0.076409    test-rmse:40.397115+5.854001 
## [221]    train-rmse:3.851500+0.085858    test-rmse:40.386394+5.870107 
## [241]    train-rmse:3.319453+0.113850    test-rmse:40.385897+5.862929 
## Stopping. Best iteration:
## [229]    train-rmse:3.647222+0.106560    test-rmse:40.378329+5.865510
## 
## [1]  train-rmse:155.549737+0.989554  test-rmse:155.936311+4.452184 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:28.332592+0.137265   test-rmse:47.199421+6.344402 
## [41] train-rmse:9.796114+0.453455    test-rmse:41.442207+6.089064 
## [61] train-rmse:6.568337+0.685001    test-rmse:41.037371+5.973341 
## [81] train-rmse:5.076299+0.559379    test-rmse:40.953091+5.987056 
## [101]    train-rmse:3.900479+0.520891    test-rmse:40.925124+6.009822 
## [121]    train-rmse:2.977942+0.611258    test-rmse:40.894707+6.015241 
## [141]    train-rmse:2.347657+0.661378    test-rmse:40.888336+6.027346 
## [161]    train-rmse:1.753699+0.675078    test-rmse:40.881569+6.016826 
## [181]    train-rmse:1.346052+0.632964    test-rmse:40.870926+6.014248 
## [201]    train-rmse:1.043443+0.560873    test-rmse:40.858707+6.006834 
## [221]    train-rmse:0.831430+0.476842    test-rmse:40.864434+6.011661 
## Stopping. Best iteration:
## [209]    train-rmse:0.954477+0.532639    test-rmse:40.857884+6.004300
## 
## [1]  train-rmse:155.549373+0.989849  test-rmse:155.936358+4.452212 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:27.391438+0.175560   test-rmse:47.506820+6.602628 
## [41] train-rmse:6.935040+0.245195    test-rmse:41.943775+6.172862 
## [61] train-rmse:3.128819+0.492549    test-rmse:41.666054+6.005776 
## [81] train-rmse:2.040610+0.478121    test-rmse:41.643736+5.980612 
## [101]    train-rmse:1.496994+0.419558    test-rmse:41.614197+5.956891 
## [121]    train-rmse:1.143136+0.359897    test-rmse:41.609170+5.961252 
## [141]    train-rmse:0.860449+0.286058    test-rmse:41.593978+5.950864 
## [161]    train-rmse:0.638717+0.191981    test-rmse:41.573003+5.940876 
## [181]    train-rmse:0.461626+0.138180    test-rmse:41.556177+5.934262 
## [201]    train-rmse:0.333635+0.101461    test-rmse:41.546124+5.937699 
## [221]    train-rmse:0.245112+0.075627    test-rmse:41.532338+5.934505 
## [241]    train-rmse:0.178459+0.057546    test-rmse:41.526864+5.934436 
## [261]    train-rmse:0.126095+0.040577    test-rmse:41.524725+5.936657 
## [281]    train-rmse:0.090937+0.032245    test-rmse:41.525127+5.940141 
## Stopping. Best iteration:
## [263]    train-rmse:0.121731+0.038786    test-rmse:41.523801+5.935661
## 
## [1]  train-rmse:155.923219+0.995006  test-rmse:156.025261+4.444763 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:45.668214+0.784018   test-rmse:50.225266+5.309379 
## [41] train-rmse:36.151563+0.927362   test-rmse:43.454034+5.095706 
## [61] train-rmse:33.201758+0.762540   test-rmse:41.816532+5.078728 
## [81] train-rmse:31.020702+0.619584   test-rmse:41.141445+5.193476 
## [101]    train-rmse:29.458874+0.529671   test-rmse:40.745794+5.290664 
## [121]    train-rmse:28.194370+0.442708   test-rmse:40.415710+5.321824 
## [141]    train-rmse:27.046179+0.462530   test-rmse:40.247833+5.352164 
## [161]    train-rmse:25.956812+0.535022   test-rmse:40.090386+5.441841 
## [181]    train-rmse:24.969629+0.489796   test-rmse:39.838965+5.472727 
## [201]    train-rmse:24.190740+0.429843   test-rmse:39.725495+5.527306 
## [221]    train-rmse:23.385090+0.392902   test-rmse:39.631204+5.590598 
## [241]    train-rmse:22.641646+0.270584   test-rmse:39.526910+5.635503 
## [261]    train-rmse:21.973537+0.228644   test-rmse:39.381268+5.695521 
## [281]    train-rmse:21.386466+0.221521   test-rmse:39.291151+5.736197 
## [301]    train-rmse:20.778860+0.220350   test-rmse:39.280137+5.867370 
## [321]    train-rmse:20.229012+0.209733   test-rmse:39.199894+5.847261 
## [341]    train-rmse:19.707310+0.195022   test-rmse:39.195720+5.851529 
## [361]    train-rmse:19.194668+0.184907   test-rmse:39.136194+5.924410 
## [381]    train-rmse:18.707829+0.191766   test-rmse:39.140376+5.947670 
## [400]    train-rmse:18.265134+0.194705   test-rmse:39.095356+5.939024 
## [1]  train-rmse:155.669957+0.992305  test-rmse:155.865894+4.419190 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.156101+0.437160   test-rmse:47.236984+5.563532 
## [41] train-rmse:26.675235+0.447150   test-rmse:40.709864+5.166045 
## [61] train-rmse:23.194599+0.381905   test-rmse:39.677992+5.215275 
## [81] train-rmse:20.757428+0.418509   test-rmse:39.128852+5.300222 
## [101]    train-rmse:18.857263+0.465870   test-rmse:38.795034+5.482592 
## [121]    train-rmse:17.142871+0.385091   test-rmse:38.575392+5.474923 
## [141]    train-rmse:15.648157+0.414026   test-rmse:38.411545+5.562134 
## [161]    train-rmse:14.409243+0.327026   test-rmse:38.267741+5.535385 
## [181]    train-rmse:13.325006+0.323475   test-rmse:38.228548+5.545185 
## [201]    train-rmse:12.372816+0.320893   test-rmse:38.129854+5.564059 
## [221]    train-rmse:11.476654+0.250907   test-rmse:38.091468+5.550118 
## [241]    train-rmse:10.691176+0.193056   test-rmse:38.039471+5.557781 
## [261]    train-rmse:9.967667+0.144746    test-rmse:38.031159+5.562501 
## [281]    train-rmse:9.231921+0.123225    test-rmse:37.979834+5.551849 
## [301]    train-rmse:8.627938+0.116817    test-rmse:37.975413+5.547636 
## [321]    train-rmse:8.048858+0.124121    test-rmse:37.961351+5.538837 
## [341]    train-rmse:7.562784+0.100356    test-rmse:37.938533+5.520537 
## [361]    train-rmse:7.060562+0.045791    test-rmse:37.929285+5.502289 
## [381]    train-rmse:6.623472+0.045477    test-rmse:37.910498+5.491523 
## [400]    train-rmse:6.232936+0.052304    test-rmse:37.934381+5.489826 
## [1]  train-rmse:155.579931+0.994539  test-rmse:155.788298+4.425868 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:33.472561+0.401728   test-rmse:46.688645+6.019535 
## [41] train-rmse:19.937952+0.467959   test-rmse:40.740099+4.993142 
## [61] train-rmse:16.920234+0.456917   test-rmse:39.880380+4.685706 
## [81] train-rmse:14.864575+0.250844   test-rmse:39.558290+4.604239 
## [101]    train-rmse:12.712289+0.493108   test-rmse:39.423447+4.608295 
## [121]    train-rmse:11.023812+0.791937   test-rmse:39.360087+4.558788 
## [141]    train-rmse:9.567143+1.039922    test-rmse:39.318630+4.531322 
## [161]    train-rmse:8.365546+1.182578    test-rmse:39.271873+4.518436 
## [181]    train-rmse:7.265670+1.222136    test-rmse:39.241595+4.482621 
## [201]    train-rmse:6.341937+1.073613    test-rmse:39.230275+4.476901 
## [221]    train-rmse:5.561561+0.959691    test-rmse:39.219937+4.457194 
## [241]    train-rmse:4.762611+0.818945    test-rmse:39.202938+4.449654 
## [261]    train-rmse:4.091788+0.707099    test-rmse:39.195238+4.473358 
## [281]    train-rmse:3.560884+0.594243    test-rmse:39.193683+4.479715 
## Stopping. Best iteration:
## [274]    train-rmse:3.735931+0.650731    test-rmse:39.188524+4.474571
## 
## [1]  train-rmse:155.558472+0.991403  test-rmse:155.815540+4.427591 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:29.578846+0.389290   test-rmse:46.412746+5.813459 
## [41] train-rmse:12.665625+0.554499   test-rmse:40.139243+4.813538 
## [61] train-rmse:9.845133+0.616631    test-rmse:39.647708+4.593992 
## [81] train-rmse:8.359111+0.691093    test-rmse:39.550131+4.558801 
## [101]    train-rmse:7.110555+0.746141    test-rmse:39.536738+4.561098 
## [121]    train-rmse:6.101495+0.674193    test-rmse:39.517281+4.620117 
## [141]    train-rmse:5.299083+0.607525    test-rmse:39.511690+4.642082 
## [161]    train-rmse:4.499875+0.558667    test-rmse:39.518928+4.656730 
## Stopping. Best iteration:
## [149]    train-rmse:4.975911+0.570371    test-rmse:39.507862+4.658671
## 
## [1]  train-rmse:155.558251+0.991538  test-rmse:155.815603+4.427637 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:28.786422+0.435636   test-rmse:46.482581+5.811310 
## [41] train-rmse:10.001607+0.429646   test-rmse:40.123257+5.070857 
## [61] train-rmse:6.587872+0.564184    test-rmse:39.708882+4.818365 
## [81] train-rmse:5.136960+0.528998    test-rmse:39.585598+4.739122 
## [101]    train-rmse:4.076033+0.414160    test-rmse:39.595872+4.751275 
## Stopping. Best iteration:
## [84] train-rmse:4.965934+0.536462    test-rmse:39.576262+4.745602
## 
## [1]  train-rmse:155.923219+0.995006  test-rmse:156.025261+4.444763 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:45.797761+0.761960   test-rmse:49.911694+5.231347 
## [41] train-rmse:36.536388+0.892651   test-rmse:43.257002+5.136335 
## [61] train-rmse:33.684870+0.873037   test-rmse:41.882848+5.017583 
## [81] train-rmse:31.626561+0.639948   test-rmse:41.132896+5.048876 
## [101]    train-rmse:30.102464+0.550477   test-rmse:40.703755+5.032130 
## [121]    train-rmse:28.773654+0.513194   test-rmse:40.416854+5.208977 
## [141]    train-rmse:27.608952+0.564818   test-rmse:40.266063+5.289847 
## [161]    train-rmse:26.515930+0.602499   test-rmse:40.018540+5.331219 
## [181]    train-rmse:25.538550+0.568406   test-rmse:39.788404+5.294486 
## [201]    train-rmse:24.694532+0.614686   test-rmse:39.655583+5.279949 
## [221]    train-rmse:23.873289+0.565829   test-rmse:39.551840+5.276938 
## [241]    train-rmse:23.130079+0.502307   test-rmse:39.418876+5.290807 
## [261]    train-rmse:22.489764+0.425688   test-rmse:39.327120+5.242715 
## [281]    train-rmse:21.766488+0.417183   test-rmse:39.286420+5.300798 
## [301]    train-rmse:21.155909+0.355048   test-rmse:39.141280+5.325774 
## [321]    train-rmse:20.569280+0.358101   test-rmse:39.085246+5.404920 
## [341]    train-rmse:20.031969+0.340610   test-rmse:39.047630+5.402351 
## [361]    train-rmse:19.503017+0.306332   test-rmse:38.981530+5.416495 
## [381]    train-rmse:19.049457+0.298042   test-rmse:38.948432+5.450925 
## [400]    train-rmse:18.639680+0.270735   test-rmse:38.898228+5.451354 
## [1]  train-rmse:155.674421+0.996358  test-rmse:155.809973+4.477781 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.827184+0.629267   test-rmse:46.955373+5.258361 
## [41] train-rmse:27.880962+0.761878   test-rmse:41.236741+4.684969 
## [61] train-rmse:24.312502+0.757889   test-rmse:40.292237+4.651939 
## [81] train-rmse:21.900595+0.676296   test-rmse:39.870986+4.697067 
## [101]    train-rmse:20.002443+0.516896   test-rmse:39.574537+4.751862 
## [121]    train-rmse:18.297759+0.469001   test-rmse:39.385012+4.779989 
## [141]    train-rmse:16.841058+0.452940   test-rmse:39.266976+4.855041 
## [161]    train-rmse:15.677410+0.381546   test-rmse:39.126837+4.844301 
## [181]    train-rmse:14.565879+0.361111   test-rmse:39.036801+4.842369 
## [201]    train-rmse:13.592803+0.352575   test-rmse:39.011461+4.862925 
## Stopping. Best iteration:
## [190]    train-rmse:14.106106+0.336381   test-rmse:38.977904+4.836467
## 
## [1]  train-rmse:155.583823+0.998974  test-rmse:155.749185+4.490540 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:34.477207+0.576108   test-rmse:46.183847+5.509669 
## [41] train-rmse:21.693379+0.733534   test-rmse:40.386987+4.591050 
## [61] train-rmse:19.019088+0.591381   test-rmse:39.819377+4.397270 
## [81] train-rmse:16.951870+0.618128   test-rmse:39.525553+4.361783 
## [101]    train-rmse:15.181216+0.767454   test-rmse:39.350090+4.458203 
## [121]    train-rmse:13.627316+0.773530   test-rmse:39.228005+4.422149 
## [141]    train-rmse:11.951699+0.790139   test-rmse:39.167303+4.416682 
## [161]    train-rmse:10.656632+0.941635   test-rmse:39.085551+4.430813 
## Stopping. Best iteration:
## [156]    train-rmse:10.924023+0.950933   test-rmse:39.069447+4.415372
## 
## [1]  train-rmse:155.564952+0.994189  test-rmse:155.764182+4.489579 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:31.125694+0.690405   test-rmse:46.230963+5.412474 
## [41] train-rmse:15.778394+0.983752   test-rmse:40.739257+4.570697 
## [61] train-rmse:13.021589+0.960713   test-rmse:40.336485+4.373620 
## [81] train-rmse:11.375103+0.936152   test-rmse:40.208827+4.315567 
## [101]    train-rmse:9.845662+0.932476    test-rmse:40.192703+4.223233 
## [121]    train-rmse:8.596861+0.879497    test-rmse:40.155396+4.164202 
## [141]    train-rmse:7.522202+0.848116    test-rmse:40.161833+4.118405 
## Stopping. Best iteration:
## [125]    train-rmse:8.348581+0.839370    test-rmse:40.140748+4.172937
## 
## [1]  train-rmse:155.564731+0.994324  test-rmse:155.764245+4.489624 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:30.294237+0.638722   test-rmse:46.183998+5.610968 
## [41] train-rmse:13.006289+0.931179   test-rmse:40.434460+4.693843 
## [61] train-rmse:9.633774+0.975980    test-rmse:40.093409+4.456081 
## [81] train-rmse:7.725633+0.823563    test-rmse:40.051690+4.355386 
## [101]    train-rmse:6.285111+0.645862    test-rmse:40.047207+4.267359 
## Stopping. Best iteration:
## [93] train-rmse:6.827789+0.706825    test-rmse:40.001798+4.318138
## 
## [1]  train-rmse:155.923219+0.995006  test-rmse:156.025261+4.444763 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:46.011139+0.818664   test-rmse:49.858349+5.224773 
## [41] train-rmse:37.036852+0.856986   test-rmse:42.994131+4.815924 
## [61] train-rmse:34.099045+0.753701   test-rmse:41.564434+4.846370 
## [81] train-rmse:32.127938+0.628897   test-rmse:41.023944+4.936077 
## [101]    train-rmse:30.562584+0.518482   test-rmse:40.607200+4.981226 
## [121]    train-rmse:29.281677+0.459487   test-rmse:40.322825+5.095042 
## [141]    train-rmse:28.104133+0.477407   test-rmse:40.076025+5.121894 
## [161]    train-rmse:27.095146+0.519780   test-rmse:39.886026+5.137129 
## [181]    train-rmse:26.122661+0.539479   test-rmse:39.681715+5.269441 
## [201]    train-rmse:25.302088+0.461277   test-rmse:39.506076+5.360436 
## [221]    train-rmse:24.534303+0.497842   test-rmse:39.424825+5.455917 
## [241]    train-rmse:23.765040+0.494997   test-rmse:39.312331+5.471052 
## [261]    train-rmse:23.062301+0.486171   test-rmse:39.248629+5.533508 
## [281]    train-rmse:22.390743+0.447190   test-rmse:39.159901+5.593207 
## [301]    train-rmse:21.796373+0.444378   test-rmse:39.147849+5.593468 
## Stopping. Best iteration:
## [289]    train-rmse:22.158002+0.470632   test-rmse:39.121658+5.575967
## 
## [1]  train-rmse:155.683375+0.997403  test-rmse:155.808269+4.475182 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.635075+0.710631   test-rmse:46.588129+5.385014 
## [41] train-rmse:28.944053+0.770717   test-rmse:40.644772+4.840380 
## [61] train-rmse:25.444153+0.685129   test-rmse:39.840403+4.863084 
## [81] train-rmse:23.125955+0.616136   test-rmse:39.430465+4.956810 
## [101]    train-rmse:21.428984+0.456083   test-rmse:39.217941+4.977984 
## [121]    train-rmse:19.770029+0.320270   test-rmse:39.061620+5.030205 
## [141]    train-rmse:18.446977+0.268956   test-rmse:38.880250+4.986119 
## [161]    train-rmse:17.173013+0.268071   test-rmse:38.781579+5.053973 
## [181]    train-rmse:16.026375+0.199319   test-rmse:38.698667+5.017995 
## [201]    train-rmse:15.061067+0.149424   test-rmse:38.643541+5.005422 
## [221]    train-rmse:14.185130+0.115322   test-rmse:38.624761+5.014609 
## [241]    train-rmse:13.328543+0.196200   test-rmse:38.613769+5.041800 
## [261]    train-rmse:12.551490+0.197290   test-rmse:38.566381+5.011979 
## [281]    train-rmse:11.757264+0.193628   test-rmse:38.545202+5.038519 
## Stopping. Best iteration:
## [269]    train-rmse:12.224207+0.214599   test-rmse:38.523666+5.030573
## 
## [1]  train-rmse:155.593576+0.999524  test-rmse:155.729175+4.482497 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:35.794634+0.761135   test-rmse:45.915867+5.097474 
## [41] train-rmse:23.877213+0.931349   test-rmse:40.332650+4.588628 
## [61] train-rmse:21.134434+0.875543   test-rmse:39.760550+4.340987 
## [81] train-rmse:18.985846+0.889907   test-rmse:39.503112+4.297516 
## [101]    train-rmse:17.274876+0.846141   test-rmse:39.278254+4.298143 
## [121]    train-rmse:15.725528+0.527950   test-rmse:39.183504+4.324950 
## [141]    train-rmse:14.338271+0.409428   test-rmse:39.108741+4.320694 
## [161]    train-rmse:13.119541+0.505128   test-rmse:39.003558+4.281511 
## [181]    train-rmse:11.939009+0.679259   test-rmse:38.928333+4.271268 
## [201]    train-rmse:10.968982+0.891851   test-rmse:38.919110+4.279946 
## [221]    train-rmse:10.028509+1.052103   test-rmse:38.872795+4.275636 
## [241]    train-rmse:9.167648+1.184751    test-rmse:38.854964+4.232894 
## [261]    train-rmse:8.271626+1.240259    test-rmse:38.818084+4.218643 
## [281]    train-rmse:7.484287+1.278640    test-rmse:38.812453+4.197328 
## Stopping. Best iteration:
## [266]    train-rmse:8.066380+1.224079    test-rmse:38.800851+4.213550
## 
## [1]  train-rmse:155.575698+0.994179  test-rmse:155.743789+4.479039 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:32.917626+0.654092   test-rmse:45.382986+5.229377 
## [41] train-rmse:18.935301+0.993680   test-rmse:39.609924+4.670923 
## [61] train-rmse:16.016446+0.942589   test-rmse:39.217724+4.358806 
## [81] train-rmse:14.261695+0.907734   test-rmse:39.165765+4.275564 
## [101]    train-rmse:12.749838+0.964577   test-rmse:39.064697+4.259235 
## [121]    train-rmse:11.454316+0.970915   test-rmse:39.032333+4.256497 
## [141]    train-rmse:10.377982+0.909442   test-rmse:38.986465+4.251055 
## [161]    train-rmse:9.379088+0.895431    test-rmse:38.962029+4.235906 
## Stopping. Best iteration:
## [156]    train-rmse:9.598302+0.910826    test-rmse:38.942701+4.250798
## 
## [1]  train-rmse:155.575477+0.994313  test-rmse:155.743852+4.479084 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:32.184717+0.555761   test-rmse:45.327270+5.300231 
## [41] train-rmse:16.558327+0.930376   test-rmse:39.403841+4.823431 
## [61] train-rmse:13.020733+0.827251   test-rmse:39.073049+4.581892 
## Stopping. Best iteration:
## [58] train-rmse:13.408920+0.805291   test-rmse:39.049898+4.605043
## 
## [1]  train-rmse:155.928510+0.994099  test-rmse:156.012588+4.424331 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:46.394729+0.785737   test-rmse:50.010573+5.141373 
## [41] train-rmse:37.477197+0.945086   test-rmse:43.292249+4.856590 
## [61] train-rmse:34.697950+0.926048   test-rmse:41.819547+4.791283 
## [81] train-rmse:32.824039+0.817514   test-rmse:41.095205+4.782919 
## [101]    train-rmse:31.405380+0.766240   test-rmse:40.706208+4.889656 
## [121]    train-rmse:30.151174+0.770466   test-rmse:40.505194+4.817776 
## [141]    train-rmse:29.031136+0.764227   test-rmse:40.209747+4.852588 
## [161]    train-rmse:28.033676+0.700861   test-rmse:39.980434+4.900808 
## [181]    train-rmse:27.047187+0.683060   test-rmse:39.805408+4.928373 
## [201]    train-rmse:26.251450+0.564662   test-rmse:39.677288+4.946854 
## [221]    train-rmse:25.482954+0.521290   test-rmse:39.528052+5.015162 
## [241]    train-rmse:24.788911+0.468435   test-rmse:39.454241+5.069034 
## [261]    train-rmse:24.076803+0.440437   test-rmse:39.385713+5.120202 
## [281]    train-rmse:23.431674+0.434120   test-rmse:39.303576+5.173130 
## [301]    train-rmse:22.851691+0.457692   test-rmse:39.247754+5.208020 
## [321]    train-rmse:22.310668+0.460469   test-rmse:39.209739+5.223890 
## [341]    train-rmse:21.799608+0.431278   test-rmse:39.144663+5.255303 
## [361]    train-rmse:21.304357+0.378111   test-rmse:39.171870+5.388337 
## Stopping. Best iteration:
## [355]    train-rmse:21.441731+0.398412   test-rmse:39.140591+5.372818
## 
## [1]  train-rmse:155.693463+0.994821  test-rmse:155.809231+4.455935 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:40.799635+1.036890   test-rmse:47.223533+5.156364 
## [41] train-rmse:31.015806+0.948111   test-rmse:41.434288+4.388744 
## [61] train-rmse:27.867882+0.804261   test-rmse:40.414331+4.274944 
## [81] train-rmse:25.584146+0.895710   test-rmse:40.020300+4.312466 
## [101]    train-rmse:23.646398+0.774060   test-rmse:39.772485+4.376203 
## [121]    train-rmse:22.095201+0.788582   test-rmse:39.558129+4.372716 
## [141]    train-rmse:20.546204+0.795319   test-rmse:39.469596+4.403057 
## [161]    train-rmse:19.395617+0.713909   test-rmse:39.407533+4.436031 
## [181]    train-rmse:18.193837+0.636075   test-rmse:39.184372+4.419624 
## [201]    train-rmse:17.003789+0.514772   test-rmse:39.080524+4.463708 
## [221]    train-rmse:16.070674+0.437204   test-rmse:39.004393+4.541454 
## [241]    train-rmse:15.166980+0.458443   test-rmse:38.925051+4.564741 
## [261]    train-rmse:14.310488+0.364218   test-rmse:38.889798+4.530142 
## [281]    train-rmse:13.546584+0.359430   test-rmse:38.824347+4.498698 
## [301]    train-rmse:12.819510+0.302203   test-rmse:38.774241+4.506507 
## [321]    train-rmse:12.195240+0.250993   test-rmse:38.763024+4.535178 
## [341]    train-rmse:11.585869+0.238547   test-rmse:38.728615+4.543407 
## [361]    train-rmse:10.980963+0.215037   test-rmse:38.685963+4.583869 
## [381]    train-rmse:10.454953+0.200367   test-rmse:38.682394+4.577352 
## Stopping. Best iteration:
## [376]    train-rmse:10.581299+0.205868   test-rmse:38.672607+4.577582
## 
## [1]  train-rmse:155.605220+0.997586  test-rmse:155.734620+4.462215 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:37.398383+0.910310   test-rmse:46.030590+5.122129 
## [41] train-rmse:26.451049+0.998985   test-rmse:40.119435+4.374975 
## [61] train-rmse:23.655319+0.942740   test-rmse:39.606841+4.309743 
## [81] train-rmse:21.649974+0.944753   test-rmse:39.486674+4.357037 
## [101]    train-rmse:19.941541+1.037673   test-rmse:39.312663+4.383887 
## [121]    train-rmse:18.446103+1.001197   test-rmse:39.190934+4.376996 
## [141]    train-rmse:17.046410+0.957040   test-rmse:39.159902+4.340655 
## [161]    train-rmse:15.823516+0.886960   test-rmse:39.160123+4.261785 
## [181]    train-rmse:14.640353+0.778228   test-rmse:39.098718+4.180354 
## [201]    train-rmse:13.517885+0.700988   test-rmse:39.117681+4.157102 
## Stopping. Best iteration:
## [189]    train-rmse:14.205256+0.852796   test-rmse:39.080676+4.144212
## 
## [1]  train-rmse:155.586684+0.993157  test-rmse:155.745685+4.457457 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:35.036128+0.954248   test-rmse:45.883272+5.086294 
## [41] train-rmse:22.448311+1.287448   test-rmse:39.840053+4.398604 
## [61] train-rmse:19.541090+1.340292   test-rmse:39.408062+4.206031 
## [81] train-rmse:17.510792+1.302728   test-rmse:39.437521+4.101296 
## Stopping. Best iteration:
## [63] train-rmse:19.307116+1.307286   test-rmse:39.389669+4.190030
## 
## [1]  train-rmse:155.586684+0.993157  test-rmse:155.745685+4.457457 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:34.443311+0.887727   test-rmse:45.755905+5.204352 
## [41] train-rmse:20.369186+1.240301   test-rmse:39.664915+4.679408 
## [61] train-rmse:16.754441+1.255015   test-rmse:39.297363+4.466601 
## [81] train-rmse:14.457300+1.230454   test-rmse:39.257343+4.394194 
## Stopping. Best iteration:
## [74] train-rmse:15.176265+1.255657   test-rmse:39.239298+4.377807
res_db <- cbind.data.frame(cv_params, rmse_vec) # Join tuning grid with its CV RMSE results
names(res_db)[3] <- "rmse" # Name the result column (single string; c() not needed)
res_db$max_depth <- as.factor(res_db$max_depth) # Convert tree depth to factor for plotting
res_db$min_child_weight <- as.factor(res_db$min_child_weight) # Convert node size to factor for plotting
# Print RMSE heatmap (the metric here is RMSE, not AUC; lower = better = blue)
g_2 <- ggplot(res_db, aes(y = max_depth, x = min_child_weight, fill = rmse)) + # set aesthetics
  geom_tile() + # Use geom_tile for heatmap
  theme_bw() + # Set theme
  scale_fill_gradient2(low = "blue", # Choose low color
                       mid = "white", # Choose mid color
                       high = "red", # Choose high color
                       midpoint = mean(res_db$rmse), # Center the diverging scale at the mean RMSE
                       space = "Lab", 
                       na.value = "grey", # Choose NA value
                       guide = "colourbar", # Set color bar
                       aesthetics = "fill") + # Select aesthetics to apply
  labs(x = "Minimum Child Weight", y = "Max Depth", fill = "RMSE") # Set labels
g_2 # Generate plot

Interpretation: This graph is a hyperparameter tuning heatmap for the XGBoost model. It helps identify the combination of two key parameters — Maximum Depth and Minimum Child Weight — that minimizes the RMSE (Root Mean Square Error), a measure of prediction error.

Performance Regions (Color-coded RMSE):

Blue/Purple Areas: Indicate lower RMSE values, meaning better model performance. Red Areas: Indicate higher RMSE values, meaning worse performance. The goal is to find the darkest blue region, as this represents the optimal hyperparameter combination.

Optimal Region: Max Depth = 5, Minimum Child Weight = 5

The dark blue area in the heatmap highlights the best-performing combination of Max Depth and Minimum Child Weight, where RMSE is lowest. This region is the sweet spot for balancing underfitting and overfitting.

This graph helps us select the optimal parameter values by visually pinpointing the area with the lowest RMSE (dark blue).

res_db[which.min(res_db$rmse),]
##    max_depth min_child_weight    rmse
## 12         5                5 37.9105
gamma_vals <- c(0, 0.05, 0.1, 0.15, 0.2) # Create vector of gamma values

# Be Careful - This can take a very long time to run
rmse_vec  <- rep(NA, length(gamma_vals)) # Pre-allocate storage: one CV RMSE per gamma value
for(i in seq_along(gamma_vals)){ # seq_along() is safe even if the vector were empty
  # Re-seed inside the loop so every gamma value is evaluated on the SAME
  # cross-validation folds (matching the seeding pattern used in the
  # subsample/colsample tuning loop below). Seeding once before the loop
  # would give each gamma different folds, making the RMSEs non-comparable.
  set.seed(111111)
  bst_tune <- xgb.cv(data = dtrain, # Set training data
                     
                     nfold = 5, # Use 5 fold cross-validation
                     
                     eta = 0.1, # Set learning rate
                     max.depth = 5, # Set max depth (chosen from the earlier depth/child-weight tuning)
                     min_child_weight = 5, # Set minimum number of samples in node to split
                     gamma = gamma_vals[i], # Set minimum loss reduction for split
                     
                     nrounds = 100, # Set number of rounds
                     early_stopping_rounds = 20, # Stop if test RMSE has not improved in 20 rounds
                     
                     verbose = 1, # 1 - Prints out fit
                     nthread = 1, # Set number of parallel threads
                     print_every_n = 20 # Prints out result every 20th iteration
  ) # Evaluation metric defaults to RMSE for regression
  
  # Store the mean test RMSE at the best (early-stopped) iteration
  rmse_vec[i] <- bst_tune$evaluation_log$test_rmse_mean[bst_tune$best_ntreelimit]
  
  
}
## [1]  train-rmse:155.669957+0.992305  test-rmse:155.865894+4.419190 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.156101+0.437160   test-rmse:47.236984+5.563532 
## [41] train-rmse:26.675235+0.447150   test-rmse:40.709864+5.166045 
## [61] train-rmse:23.194599+0.381905   test-rmse:39.677992+5.215275 
## [81] train-rmse:20.757428+0.418509   test-rmse:39.128852+5.300222 
## [100]    train-rmse:18.921969+0.471757   test-rmse:38.798153+5.487136 
## [1]  train-rmse:155.624139+0.855612  test-rmse:155.946478+3.867198 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:37.873850+0.305590   test-rmse:48.366548+3.782038 
## [41] train-rmse:26.462077+0.311419   test-rmse:42.623345+2.699476 
## [61] train-rmse:22.999550+0.357038   test-rmse:42.000176+2.679805 
## [81] train-rmse:20.721396+0.347934   test-rmse:41.755204+2.666869 
## [100]    train-rmse:18.928444+0.218683   test-rmse:41.536280+2.716259 
## [1]  train-rmse:155.642011+0.757829  test-rmse:155.795694+3.283537 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:37.954876+0.326507   test-rmse:48.271183+4.289049 
## [41] train-rmse:26.480096+0.799600   test-rmse:42.900550+3.902017 
## [61] train-rmse:22.938072+0.717120   test-rmse:42.290639+3.982604 
## [81] train-rmse:20.452240+0.477754   test-rmse:41.903557+4.180795 
## [100]    train-rmse:18.774209+0.367113   test-rmse:41.789470+4.292675 
## [1]  train-rmse:155.635936+0.941595  test-rmse:155.836554+4.065533 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.077843+0.477146   test-rmse:47.794288+4.627716 
## [41] train-rmse:26.883747+0.547061   test-rmse:42.321650+4.922131 
## [61] train-rmse:23.237187+0.663856   test-rmse:41.752059+5.094904 
## [81] train-rmse:20.774524+0.519789   test-rmse:41.451020+5.218075 
## [100]    train-rmse:19.097906+0.412005   test-rmse:41.381768+5.361733 
## [1]  train-rmse:155.649032+0.896118  test-rmse:155.825481+3.924286 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.032362+0.535894   test-rmse:49.286317+2.789323 
## [41] train-rmse:26.416388+0.650058   test-rmse:43.439172+2.282346 
## [61] train-rmse:22.782243+0.519330   test-rmse:42.562767+2.655483 
## [81] train-rmse:20.432549+0.373585   test-rmse:42.167702+2.850239 
## [100]    train-rmse:18.791343+0.320954   test-rmse:41.961346+2.985648
# Let's view our results to identify the value of gamma to use:

# Gamma results
# Pair each candidate gamma with its cross-validated RMSE
data.frame(gamma_vals, rmse_vec)
##   gamma_vals rmse_vec
## 1       0.00 38.79815
## 2       0.05 41.53384
## 3       0.10 41.78238
## 4       0.15 41.36386
## 5       0.20 41.96135

Subsample and Column Sample Tuning

subsample <- c(0.6, 0.7, 0.8, 0.9, 1) # Create vector of candidate row-subsample rates
colsample_by_tree <- c(0.6, 0.7, 0.8, 0.9, 1) # Create vector of candidate column-sample rates

# Expand grid of tuning parameters (all 25 combinations)
cv_params <- expand.grid(subsample, colsample_by_tree)
names(cv_params) <- c("subsample", "colsample_by_tree")
# Pre-allocate vector to store the CV RMSE for each combination
rmse_vec <- rep(NA, nrow(cv_params))
# Loop through parameter combinations
# (seq_len() is safe even for an empty grid, unlike 1:nrow())
for(i in seq_len(nrow(cv_params))){
  set.seed(111111) # Same seed each iteration so CV folds are comparable
  bst_tune <- xgb.cv(data = dtrain, # Set training data
                     
                     nfold = 5, # Use 5 fold cross-validation
                     
                     eta = 0.1, # Set learning rate (fixed from earlier tuning)
                     max.depth = 5, # Set max depth (fixed from earlier tuning)
                     min_child_weight = 5, # Set minimum number of samples in node to split
                     gamma = 0, # Set minimum loss reduction for split (from gamma tuning)
                     subsample = cv_params$subsample[i], # Set proportion of training rows to use in each tree
                     colsample_bytree = cv_params$colsample_by_tree[i], # Set proportion of columns to use in each tree
                     
                     nrounds = 150, # Set number of rounds
                     early_stopping_rounds = 20, # Set number of rounds to stop at if there is no improvement
                     
                     verbose = 1, # 1 - Prints out fit
                     nthread = 1, # Set number of parallel threads
                     print_every_n = 20 # Prints out result every 20th iteration
  )
  
  # Record the test RMSE at the best iteration found by early stopping
  rmse_vec[i] <- bst_tune$evaluation_log$test_rmse_mean[bst_tune$best_ntreelimit]
  
  
}
## [1]  train-rmse:156.291580+1.134424  test-rmse:156.380547+4.490223 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:41.625678+1.188006   test-rmse:48.723618+5.732277 
## [41] train-rmse:30.270035+1.095867   test-rmse:41.603933+5.040977 
## [61] train-rmse:27.145651+0.948699   test-rmse:40.860320+5.189090 
## [81] train-rmse:24.942090+1.047179   test-rmse:40.506981+5.328928 
## [101]    train-rmse:23.122362+1.036449   test-rmse:39.998408+5.231731 
## [121]    train-rmse:21.547796+0.992947   test-rmse:39.864006+5.410231 
## [141]    train-rmse:20.212245+0.947153   test-rmse:39.607059+5.270484 
## [150]    train-rmse:19.580229+0.916530   test-rmse:39.486594+5.215443 
## [1]  train-rmse:156.180297+1.180874  test-rmse:156.274783+4.270194 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:41.012587+1.402185   test-rmse:48.392872+5.573266 
## [41] train-rmse:29.301820+1.389661   test-rmse:41.471292+4.935322 
## [61] train-rmse:25.998738+1.070476   test-rmse:40.474000+4.978077 
## [81] train-rmse:23.563668+0.989414   test-rmse:39.918954+5.126147 
## [101]    train-rmse:21.542258+0.860675   test-rmse:39.502628+5.085090 
## [121]    train-rmse:19.979315+0.930820   test-rmse:39.261521+5.160797 
## [141]    train-rmse:18.515159+0.954365   test-rmse:39.240929+5.233198 
## [150]    train-rmse:17.905418+0.878707   test-rmse:39.146248+5.217605 
## [1]  train-rmse:156.113181+1.114995  test-rmse:156.289350+4.380318 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:40.255881+0.998045   test-rmse:48.610496+5.957629 
## [41] train-rmse:28.220078+0.691427   test-rmse:41.577854+5.603400 
## [61] train-rmse:24.796127+0.746467   test-rmse:40.533496+5.588876 
## [81] train-rmse:22.328533+0.553579   test-rmse:39.979970+5.690993 
## [101]    train-rmse:20.340265+0.631716   test-rmse:39.661534+5.763079 
## [121]    train-rmse:18.716275+0.593216   test-rmse:39.475244+5.796814 
## [141]    train-rmse:17.324511+0.551708   test-rmse:39.302907+5.797290 
## [150]    train-rmse:16.684074+0.573041   test-rmse:39.206503+5.758951 
## [1]  train-rmse:156.064116+1.120800  test-rmse:156.141441+4.395624 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.809305+0.801021   test-rmse:48.330223+5.981505 
## [41] train-rmse:27.814969+0.621679   test-rmse:41.103771+5.335469 
## [61] train-rmse:24.113012+0.407792   test-rmse:40.047294+5.202767 
## [81] train-rmse:21.583854+0.397496   test-rmse:39.472346+5.133864 
## [101]    train-rmse:19.596170+0.287657   test-rmse:39.077661+5.149512 
## [121]    train-rmse:18.001721+0.375514   test-rmse:38.922964+5.132934 
## [141]    train-rmse:16.584435+0.262342   test-rmse:38.761365+5.144768 
## [150]    train-rmse:16.030579+0.264398   test-rmse:38.687578+5.177202 
## [1]  train-rmse:155.849898+0.907883  test-rmse:156.125241+4.589829 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.294708+0.481438   test-rmse:48.084461+6.134848 
## [41] train-rmse:26.975843+0.568916   test-rmse:40.918388+5.612634 
## [61] train-rmse:23.202392+0.651144   test-rmse:39.989423+5.456797 
## [81] train-rmse:20.810667+0.668823   test-rmse:39.560855+5.443846 
## [101]    train-rmse:19.026590+0.524716   test-rmse:39.203812+5.517627 
## [121]    train-rmse:17.462076+0.460771   test-rmse:39.022355+5.510305 
## [141]    train-rmse:16.184580+0.391956   test-rmse:38.897916+5.518509 
## [150]    train-rmse:15.646761+0.361256   test-rmse:38.830695+5.509801 
## [1]  train-rmse:156.177440+1.112329  test-rmse:156.214763+4.419398 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:41.054854+1.035014   test-rmse:47.900869+6.040799 
## [41] train-rmse:30.065247+1.029730   test-rmse:40.943314+5.606810 
## [61] train-rmse:27.261584+0.895330   test-rmse:40.268873+5.609198 
## [81] train-rmse:25.164523+0.950737   test-rmse:39.846401+5.531536 
## [101]    train-rmse:23.230802+0.973376   test-rmse:39.489493+5.396859 
## [121]    train-rmse:21.636450+1.037630   test-rmse:39.223737+5.469100 
## [141]    train-rmse:20.179279+1.087043   test-rmse:39.010269+5.606112 
## [150]    train-rmse:19.560585+1.097386   test-rmse:38.983560+5.518761 
## [1]  train-rmse:156.092013+1.191476  test-rmse:156.256243+4.235904 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:40.458992+0.856766   test-rmse:47.613359+5.793521 
## [41] train-rmse:29.188384+1.057993   test-rmse:40.931213+5.118927 
## [61] train-rmse:25.902044+0.783975   test-rmse:40.164865+5.208996 
## [81] train-rmse:23.637899+0.746726   test-rmse:39.693487+5.112769 
## [101]    train-rmse:21.705293+0.827438   test-rmse:39.451026+5.223210 
## [121]    train-rmse:19.855097+0.878348   test-rmse:39.192646+5.330260 
## [141]    train-rmse:18.400175+0.812008   test-rmse:39.057665+5.298968 
## [150]    train-rmse:17.671689+0.778241   test-rmse:38.998893+5.362957 
## [1]  train-rmse:155.991359+1.127262  test-rmse:156.161536+4.473588 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.740164+0.559403   test-rmse:47.401289+5.868959 
## [41] train-rmse:28.078753+0.581502   test-rmse:40.831795+5.029958 
## [61] train-rmse:24.642379+0.728251   test-rmse:39.933057+4.987196 
## [81] train-rmse:22.064594+0.652155   test-rmse:39.403365+5.130609 
## [101]    train-rmse:20.085370+0.495443   test-rmse:38.861509+4.989941 
## [121]    train-rmse:18.420895+0.494583   test-rmse:38.711803+4.985875 
## [141]    train-rmse:16.951364+0.395819   test-rmse:38.517433+5.056847 
## [150]    train-rmse:16.297066+0.366139   test-rmse:38.510253+5.092911 
## [1]  train-rmse:155.985737+1.145537  test-rmse:156.288793+4.357654 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.184214+0.469826   test-rmse:47.523167+5.813609 
## [41] train-rmse:27.399645+0.511137   test-rmse:40.624388+4.960100 
## [61] train-rmse:23.989185+0.448009   test-rmse:39.629873+4.862628 
## [81] train-rmse:21.449382+0.361444   test-rmse:39.167148+4.944653 
## [101]    train-rmse:19.439899+0.293436   test-rmse:38.829178+4.943255 
## [121]    train-rmse:17.761871+0.343214   test-rmse:38.545187+4.963868 
## [141]    train-rmse:16.384352+0.291314   test-rmse:38.395175+4.935563 
## [150]    train-rmse:15.782046+0.315989   test-rmse:38.322918+4.934728 
## [1]  train-rmse:155.839298+0.898715  test-rmse:156.149274+4.656773 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.547628+0.508129   test-rmse:47.447583+5.922893 
## [41] train-rmse:26.764550+0.710443   test-rmse:40.719245+5.115681 
## [61] train-rmse:23.378775+0.770796   test-rmse:39.808524+4.980537 
## [81] train-rmse:20.985002+0.756028   test-rmse:39.270668+4.985628 
## [101]    train-rmse:19.224153+0.813055   test-rmse:39.028684+5.054643 
## [121]    train-rmse:17.533188+0.665563   test-rmse:38.812500+5.106983 
## [141]    train-rmse:16.140011+0.633244   test-rmse:38.711704+5.129082 
## [150]    train-rmse:15.569447+0.565968   test-rmse:38.648896+5.136756 
## [1]  train-rmse:156.178377+1.113106  test-rmse:156.229209+4.422549 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:40.891827+0.901844   test-rmse:47.167286+5.151926 
## [41] train-rmse:30.142583+0.996419   test-rmse:40.656223+4.759258 
## [61] train-rmse:27.224142+0.900427   test-rmse:39.787144+4.770118 
## [81] train-rmse:25.130520+0.963399   test-rmse:39.321836+4.593772 
## [101]    train-rmse:23.454367+0.947753   test-rmse:38.948115+4.633573 
## [121]    train-rmse:21.904743+0.925526   test-rmse:38.842512+4.845019 
## [141]    train-rmse:20.470661+0.865073   test-rmse:38.695299+4.904461 
## [150]    train-rmse:19.817300+0.884533   test-rmse:38.659919+4.822291 
## [1]  train-rmse:156.084983+1.219448  test-rmse:156.195661+4.311254 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:40.121248+0.825825   test-rmse:47.451767+5.410110 
## [41] train-rmse:28.997394+0.921959   test-rmse:41.017012+4.925160 
## [61] train-rmse:26.018578+0.654137   test-rmse:40.160959+4.850924 
## [81] train-rmse:23.776847+0.798661   test-rmse:39.741252+4.830851 
## [101]    train-rmse:21.792616+0.681667   test-rmse:39.394424+4.926326 
## [121]    train-rmse:20.037946+0.694976   test-rmse:39.116832+5.036429 
## [141]    train-rmse:18.411468+0.632667   test-rmse:38.959986+5.159385 
## [150]    train-rmse:17.723150+0.594442   test-rmse:38.848474+5.277316 
## [1]  train-rmse:156.015442+1.174104  test-rmse:156.156272+4.425099 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.564629+0.528510   test-rmse:47.014312+6.059164 
## [41] train-rmse:28.031450+0.525119   test-rmse:40.837430+5.165862 
## [61] train-rmse:24.826333+0.625236   test-rmse:40.052544+5.071428 
## [81] train-rmse:22.184593+0.639494   test-rmse:39.506227+5.124452 
## [101]    train-rmse:20.163674+0.609068   test-rmse:39.181205+5.056354 
## [121]    train-rmse:18.311667+0.599425   test-rmse:39.000186+5.140381 
## [141]    train-rmse:16.837760+0.493834   test-rmse:38.780511+5.196465 
## [150]    train-rmse:16.139689+0.498764   test-rmse:38.742693+5.218155 
## [1]  train-rmse:155.982942+1.143467  test-rmse:156.301868+4.333073 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.085432+0.596514   test-rmse:47.650925+5.543978 
## [41] train-rmse:27.392445+0.440213   test-rmse:40.782148+4.576309 
## [61] train-rmse:23.765727+0.551476   test-rmse:39.647850+4.516121 
## [81] train-rmse:21.401628+0.400158   test-rmse:39.068174+4.456198 
## [101]    train-rmse:19.462099+0.412259   test-rmse:38.674099+4.456616 
## [121]    train-rmse:17.825618+0.313008   test-rmse:38.417274+4.527150 
## [141]    train-rmse:16.323121+0.289241   test-rmse:38.288630+4.547847 
## [150]    train-rmse:15.787460+0.324964   test-rmse:38.247348+4.530877 
## [1]  train-rmse:155.839187+0.898436  test-rmse:156.218077+4.704648 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.311036+0.450218   test-rmse:47.340778+5.850827 
## [41] train-rmse:26.714710+0.468377   test-rmse:40.870520+5.156912 
## [61] train-rmse:23.111174+0.647136   test-rmse:39.863700+5.036713 
## [81] train-rmse:20.751335+0.461089   test-rmse:39.332642+4.962572 
## [101]    train-rmse:19.014339+0.467169   test-rmse:39.063752+4.995765 
## [121]    train-rmse:17.434822+0.413803   test-rmse:38.886996+4.943162 
## [141]    train-rmse:16.124520+0.334463   test-rmse:38.755606+4.935025 
## [150]    train-rmse:15.524068+0.272693   test-rmse:38.694992+4.917269 
## [1]  train-rmse:156.112079+1.131794  test-rmse:156.231783+4.428030 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:40.631075+0.979470   test-rmse:47.184586+5.025250 
## [41] train-rmse:30.224822+1.194308   test-rmse:40.960456+4.533442 
## [61] train-rmse:27.373110+0.947946   test-rmse:40.264137+4.555948 
## [81] train-rmse:25.442543+0.957456   test-rmse:39.942953+4.446183 
## [101]    train-rmse:23.538515+0.927151   test-rmse:39.874912+4.375549 
## [121]    train-rmse:21.869966+1.077127   test-rmse:39.581849+4.362541 
## [141]    train-rmse:20.314956+1.009889   test-rmse:39.433813+4.321348 
## [150]    train-rmse:19.621685+0.964872   test-rmse:39.326409+4.321024 
## [1]  train-rmse:156.050601+1.214178  test-rmse:156.226310+4.438342 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.909838+0.913504   test-rmse:46.851218+4.986304 
## [41] train-rmse:28.961209+1.008499   test-rmse:40.436367+4.440323 
## [61] train-rmse:25.856636+0.940638   test-rmse:39.512138+4.439425 
## [81] train-rmse:23.720859+0.981385   test-rmse:38.942781+4.457746 
## [101]    train-rmse:21.771561+0.950106   test-rmse:38.581884+4.461963 
## [121]    train-rmse:19.895905+0.852775   test-rmse:38.353050+4.547435 
## [141]    train-rmse:18.254287+0.798754   test-rmse:38.227143+4.555186 
## [150]    train-rmse:17.573858+0.804880   test-rmse:38.083244+4.543580 
## [1]  train-rmse:156.007363+1.175878  test-rmse:156.118958+4.476477 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.181850+0.485702   test-rmse:46.755726+5.444674 
## [41] train-rmse:28.059356+0.709385   test-rmse:40.453957+4.701023 
## [61] train-rmse:24.701509+0.745351   test-rmse:39.701412+4.779918 
## [81] train-rmse:22.211652+0.881894   test-rmse:39.225590+4.790598 
## [101]    train-rmse:20.231272+0.886198   test-rmse:38.920089+4.825687 
## [121]    train-rmse:18.453885+0.767870   test-rmse:38.848609+4.881352 
## [141]    train-rmse:16.871992+0.656660   test-rmse:38.603432+4.941519 
## [150]    train-rmse:16.247109+0.679694   test-rmse:38.533410+4.963071 
## [1]  train-rmse:155.991614+1.124071  test-rmse:156.202968+4.372182 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.700786+0.431380   test-rmse:46.862356+5.359729 
## [41] train-rmse:27.259100+0.686857   test-rmse:40.894935+4.801975 
## [61] train-rmse:23.899220+0.753958   test-rmse:40.115942+4.944050 
## [81] train-rmse:21.319265+0.612384   test-rmse:39.702955+5.077506 
## [101]    train-rmse:19.359289+0.577397   test-rmse:39.379893+5.167841 
## [121]    train-rmse:17.710481+0.550666   test-rmse:39.122219+5.160412 
## [141]    train-rmse:16.207076+0.517892   test-rmse:39.010788+5.162200 
## [150]    train-rmse:15.606470+0.454306   test-rmse:38.975855+5.187639 
## [1]  train-rmse:155.756163+0.955411  test-rmse:155.939527+4.474408 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.218010+0.607671   test-rmse:46.845340+5.411159 
## [41] train-rmse:26.649823+0.476835   test-rmse:40.544062+4.546405 
## [61] train-rmse:22.965013+0.470372   test-rmse:39.404912+4.596457 
## [81] train-rmse:20.690756+0.480757   test-rmse:38.899079+4.532180 
## [101]    train-rmse:18.801046+0.280414   test-rmse:38.653309+4.571344 
## [121]    train-rmse:17.275738+0.188771   test-rmse:38.495048+4.578351 
## [141]    train-rmse:15.949040+0.168770   test-rmse:38.458078+4.598557 
## [150]    train-rmse:15.444665+0.153808   test-rmse:38.424424+4.622152 
## [1]  train-rmse:155.716033+1.041207  test-rmse:155.878949+4.240293 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.980271+0.899031   test-rmse:46.995177+4.864283 
## [41] train-rmse:29.881118+0.830316   test-rmse:40.672883+4.211619 
## [61] train-rmse:26.866022+0.816013   test-rmse:39.893607+4.319214 
## [81] train-rmse:24.849374+0.758049   test-rmse:39.353926+4.250231 
## [101]    train-rmse:23.086081+0.786834   test-rmse:39.135953+4.301608 
## [121]    train-rmse:21.586225+0.810182   test-rmse:39.039988+4.171739 
## [141]    train-rmse:19.943191+0.835024   test-rmse:38.910651+4.135550 
## [150]    train-rmse:19.284659+0.807842   test-rmse:38.773888+4.138759 
## [1]  train-rmse:155.697139+1.044981  test-rmse:155.894196+4.293842 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.361181+0.618975   test-rmse:47.148422+4.887731 
## [41] train-rmse:28.838144+0.951358   test-rmse:40.925137+3.766718 
## [61] train-rmse:25.810585+0.809332   test-rmse:40.142945+3.624474 
## [81] train-rmse:23.619256+0.848361   test-rmse:39.670041+3.679982 
## [101]    train-rmse:21.644541+0.893331   test-rmse:39.352504+3.763876 
## [121]    train-rmse:19.943548+0.910673   test-rmse:39.103856+3.683345 
## [141]    train-rmse:18.298481+0.858152   test-rmse:38.925187+3.697270 
## [150]    train-rmse:17.594802+0.846605   test-rmse:38.893378+3.760133 
## [1]  train-rmse:155.666892+0.993491  test-rmse:155.972399+4.366537 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.726839+0.755645   test-rmse:47.002576+4.969829 
## [41] train-rmse:27.874082+0.944789   test-rmse:40.804194+3.990876 
## [61] train-rmse:24.663644+0.886228   test-rmse:40.059195+4.004015 
## [81] train-rmse:22.272936+0.958236   test-rmse:39.640069+4.101683 
## [101]    train-rmse:20.204768+0.882501   test-rmse:39.233083+4.253777 
## [121]    train-rmse:18.396895+0.864541   test-rmse:39.105024+4.244420 
## [141]    train-rmse:16.822424+0.756017   test-rmse:38.893167+4.292641 
## [150]    train-rmse:16.129316+0.713102   test-rmse:38.830747+4.262617 
## [1]  train-rmse:155.643642+0.991466  test-rmse:155.784404+4.409067 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.472527+0.543640   test-rmse:46.960939+5.401652 
## [41] train-rmse:27.165821+0.655782   test-rmse:40.899146+4.883581 
## [61] train-rmse:23.517065+0.761286   test-rmse:39.929740+4.944964 
## [81] train-rmse:20.977950+0.772185   test-rmse:39.359333+5.086245 
## [101]    train-rmse:18.999433+0.561110   test-rmse:39.065539+5.207791 
## [121]    train-rmse:17.399469+0.539789   test-rmse:38.940123+5.339607 
## [141]    train-rmse:15.882419+0.519547   test-rmse:38.813482+5.340270 
## [150]    train-rmse:15.278418+0.426091   test-rmse:38.745651+5.310615 
## [1]  train-rmse:155.669957+0.992305  test-rmse:155.865894+4.419190 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:38.156101+0.437160   test-rmse:47.236984+5.563532 
## [41] train-rmse:26.675235+0.447150   test-rmse:40.709864+5.166045 
## [61] train-rmse:23.194599+0.381905   test-rmse:39.677992+5.215275 
## [81] train-rmse:20.757428+0.418509   test-rmse:39.128852+5.300222 
## [101]    train-rmse:18.857263+0.465870   test-rmse:38.795034+5.482592 
## [121]    train-rmse:17.142871+0.385091   test-rmse:38.575392+5.474923 
## [141]    train-rmse:15.648157+0.414026   test-rmse:38.411545+5.562134 
## [150]    train-rmse:15.049870+0.365411   test-rmse:38.323501+5.579656
# Visualise subsample / column-sample tuning results as a heatmap

res_db <- cbind.data.frame(cv_params, rmse_vec) # Join parameter grid to CV RMSE
names(res_db)[3] <- "rmse" # Rename result column (single name, no c() needed)
res_db$subsample <- as.factor(res_db$subsample) # Convert subsample to factor for plotting
res_db$colsample_by_tree <- as.factor(res_db$colsample_by_tree) # Convert column sample to factor for plotting
g_4 <- ggplot(res_db, aes(y = colsample_by_tree, x = subsample, fill = rmse)) + # set aesthetics
  geom_tile() + # Use geom_tile for heatmap
  theme_bw() + # Set theme
  scale_fill_gradient2(low = "blue", # Choose low color
                       mid = "white", # Choose mid color
                       high = "red", # Choose high color
                       midpoint = mean(res_db$rmse), # Centre the scale on the mean RMSE
                       space = "Lab", 
                       na.value = "grey", # Choose NA value
                       guide = "colourbar", # Set color bar
                       aesthetics = "fill") + # Select aesthetics to apply
  labs(x = "Subsample", y = "Column Sample by Tree", fill = "RMSE") # Set labels
g_4 # Generate plot

The optimal values are subsample = 0.7 and column sample by tree = 0.9, which we use going forward.

###### 4 - eta tuning ######

# Cross-validate with a high learning rate (eta = 0.3) using xgb.cv
set.seed(111111)
bst_mod_1 <- xgb.cv(
  data = dtrain,               # Training data
  nfold = 5,                   # 5-fold cross-validation
  params = list(
    eta = 0.3,                 # Learning rate
    max_depth = 5,             # Maximum tree depth
    min_child_weight = 5,      # Minimum samples in a node to allow a split
    gamma = 0,                 # Minimum loss reduction required for a split
    subsample = 0.7,           # Proportion of training rows used per tree
    colsample_bytree = 0.9     # Proportion of columns used per tree
  ),
  nrounds = 1000,              # Maximum number of boosting rounds
  early_stopping_rounds = 20,  # Stop if no improvement for 20 rounds
  verbose = 1,                 # Print fitting progress
  nthread = 1,                 # Number of parallel threads
  print_every_n = 20           # Print result every 20th iteration
)
## [1]  train-rmse:126.221429+1.726339  test-rmse:126.967395+4.729086 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:27.281582+0.886040   test-rmse:43.327927+4.965869 
## [41] train-rmse:21.582590+1.297152   test-rmse:42.803436+4.791390 
## [61] train-rmse:17.213924+1.095050   test-rmse:42.769312+4.934886 
## Stopping. Best iteration:
## [57] train-rmse:18.144053+1.170634   test-rmse:42.691267+4.772975
# Cross-validate with a moderate learning rate (eta = 0.1) using xgb.cv
set.seed(111111)
bst_mod_2 <- xgb.cv(
  data = dtrain,               # Training data
  nfold = 5,                   # 5-fold cross-validation
  params = list(
    eta = 0.1,                 # Learning rate
    max_depth = 5,             # Maximum tree depth
    min_child_weight = 5,      # Minimum samples in a node to allow a split
    gamma = 0,                 # Minimum loss reduction required for a split
    subsample = 0.7,           # Proportion of training rows used per tree
    colsample_bytree = 0.9     # Proportion of columns used per tree
  ),
  nrounds = 1000,              # Maximum number of boosting rounds
  early_stopping_rounds = 20,  # Stop if no improvement for 20 rounds
  verbose = 1,                 # Print fitting progress
  nthread = 1,                 # Number of parallel threads
  print_every_n = 20           # Print result every 20th iteration
)
## [1]  train-rmse:156.050601+1.214178  test-rmse:156.226310+4.438342 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:39.909838+0.913504   test-rmse:46.851218+4.986304 
## [41] train-rmse:28.961209+1.008499   test-rmse:40.436367+4.440323 
## [61] train-rmse:25.856636+0.940638   test-rmse:39.512138+4.439425 
## [81] train-rmse:23.720859+0.981385   test-rmse:38.942781+4.457746 
## [101]    train-rmse:21.771561+0.950106   test-rmse:38.581884+4.461963 
## [121]    train-rmse:19.895905+0.852775   test-rmse:38.353050+4.547435 
## [141]    train-rmse:18.254287+0.798754   test-rmse:38.227143+4.555186 
## [161]    train-rmse:16.863762+0.765064   test-rmse:38.027892+4.590511 
## [181]    train-rmse:15.539537+0.755156   test-rmse:38.066572+4.606022 
## Stopping. Best iteration:
## [162]    train-rmse:16.776628+0.782759   test-rmse:38.026948+4.578883
# Cross-validate with a lower learning rate (eta = 0.05) using xgb.cv
set.seed(111111)
bst_mod_3 <- xgb.cv(
  data = dtrain,               # Training data
  nfold = 5,                   # 5-fold cross-validation
  params = list(
    eta = 0.05,                # Learning rate
    max_depth = 5,             # Maximum tree depth
    min_child_weight = 5,      # Minimum samples in a node to allow a split
    gamma = 0,                 # Minimum loss reduction required for a split
    subsample = 0.7,           # Proportion of training rows used per tree
    colsample_bytree = 0.9     # Proportion of columns used per tree
  ),
  nrounds = 1000,              # Maximum number of boosting rounds
  early_stopping_rounds = 20,  # Stop if no improvement for 20 rounds
  verbose = 1,                 # Print fitting progress
  nthread = 1,                 # Number of parallel threads
  print_every_n = 20           # Print result every 20th iteration
)
## [1]  train-rmse:163.645596+1.136107  test-rmse:163.705952+4.384028 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:70.946041+0.724285   test-rmse:73.543190+5.349142 
## [41] train-rmse:40.944173+0.714031   test-rmse:47.714242+5.109761 
## [61] train-rmse:32.036698+0.718619   test-rmse:41.870618+4.629976 
## [81] train-rmse:28.750940+0.887176   test-rmse:40.332021+4.398836 
## [101]    train-rmse:26.989661+0.793043   test-rmse:39.685025+4.434232 
## [121]    train-rmse:25.746344+0.769723   test-rmse:39.417688+4.392644 
## [141]    train-rmse:24.574854+0.811988   test-rmse:39.228437+4.450826 
## [161]    train-rmse:23.496198+0.933040   test-rmse:39.068872+4.509305 
## [181]    train-rmse:22.508068+0.949237   test-rmse:38.852319+4.465521 
## [201]    train-rmse:21.652438+0.972101   test-rmse:38.766980+4.519673 
## [221]    train-rmse:20.799272+0.990975   test-rmse:38.647060+4.491595 
## [241]    train-rmse:19.956687+0.933110   test-rmse:38.505983+4.577817 
## [261]    train-rmse:19.160625+0.915361   test-rmse:38.402323+4.540959 
## [281]    train-rmse:18.377205+0.831631   test-rmse:38.341734+4.560897 
## [301]    train-rmse:17.658087+0.814335   test-rmse:38.276093+4.609224 
## [321]    train-rmse:16.948371+0.800184   test-rmse:38.212106+4.634375 
## [341]    train-rmse:16.261165+0.763589   test-rmse:38.142769+4.614524 
## [361]    train-rmse:15.584701+0.693656   test-rmse:38.084905+4.612309 
## [381]    train-rmse:14.949266+0.680382   test-rmse:38.048500+4.592930 
## [401]    train-rmse:14.365648+0.654907   test-rmse:38.054611+4.601365 
## Stopping. Best iteration:
## [385]    train-rmse:14.825310+0.681590   test-rmse:38.028778+4.596208
# Cross-validate with the lowest learning rate (eta = 0.01) using xgb.cv
set.seed(111111)
bst_mod_4 <- xgb.cv(
  data = dtrain,               # Training data
  nfold = 5,                   # 5-fold cross-validation
  params = list(
    eta = 0.01,                # Learning rate
    max_depth = 5,             # Maximum tree depth
    min_child_weight = 5,      # Minimum samples in a node to allow a split
    gamma = 0,                 # Minimum loss reduction required for a split
    subsample = 0.7,           # Proportion of training rows used per tree
    colsample_bytree = 0.9     # Proportion of columns used per tree
  ),
  nrounds = 1000,              # Maximum number of boosting rounds
  early_stopping_rounds = 20,  # Stop if no improvement for 20 rounds
  verbose = 1,                 # Print fitting progress
  nthread = 1,                 # Number of parallel threads
  print_every_n = 20           # Print result every 20th iteration
)
## [1]  train-rmse:169.751058+1.089100  test-rmse:169.725113+4.346152 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:141.880723+0.955250  test-rmse:142.257888+4.498548 
## [41] train-rmse:119.167794+0.872740  test-rmse:120.037299+4.713734 
## [61] train-rmse:100.802743+0.811676  test-rmse:102.244318+4.944918 
## [81] train-rmse:85.894818+0.782803   test-rmse:87.992390+5.136235 
## [101]    train-rmse:73.884222+0.717277   test-rmse:76.688176+5.352523 
## [121]    train-rmse:64.235126+0.729433   test-rmse:67.782069+5.497611 
## [141]    train-rmse:56.555812+0.710863   test-rmse:60.908277+5.599919 
## [161]    train-rmse:50.471793+0.734623   test-rmse:55.655064+5.607419 
## [181]    train-rmse:45.661772+0.732638   test-rmse:51.709027+5.614634 
## [201]    train-rmse:41.887600+0.722793   test-rmse:48.730275+5.541621 
## [221]    train-rmse:38.932038+0.739646   test-rmse:46.558135+5.472146 
## [241]    train-rmse:36.590506+0.768168   test-rmse:44.884299+5.330835 
## [261]    train-rmse:34.744114+0.813147   test-rmse:43.700276+5.208201 
## [281]    train-rmse:33.284552+0.846326   test-rmse:42.825866+5.126308 
## [301]    train-rmse:32.135062+0.861412   test-rmse:42.131992+5.028858 
## [321]    train-rmse:31.221536+0.881145   test-rmse:41.607400+4.923748 
## [341]    train-rmse:30.439584+0.886590   test-rmse:41.192316+4.900387 
## [361]    train-rmse:29.804281+0.905998   test-rmse:40.909016+4.860042 
## [381]    train-rmse:29.241552+0.922932   test-rmse:40.680168+4.796410 
## [401]    train-rmse:28.721517+0.933025   test-rmse:40.468060+4.768594 
## [421]    train-rmse:28.283970+0.935860   test-rmse:40.272850+4.720182 
## [441]    train-rmse:27.885668+0.920283   test-rmse:40.127046+4.688938 
## [461]    train-rmse:27.533809+0.925783   test-rmse:39.995131+4.670988 
## [481]    train-rmse:27.215313+0.941093   test-rmse:39.903066+4.656156 
## [501]    train-rmse:26.885082+0.940984   test-rmse:39.795102+4.644757 
## [521]    train-rmse:26.586074+0.934100   test-rmse:39.693998+4.626631 
## [541]    train-rmse:26.307228+0.938589   test-rmse:39.609207+4.626136 
## [561]    train-rmse:26.025758+0.940028   test-rmse:39.529579+4.622719 
## [581]    train-rmse:25.746053+0.936777   test-rmse:39.442170+4.614326 
## [601]    train-rmse:25.502729+0.946358   test-rmse:39.381470+4.598905 
## [621]    train-rmse:25.248953+0.947486   test-rmse:39.312105+4.599686 
## [641]    train-rmse:24.998050+0.944837   test-rmse:39.245589+4.600753 
## [661]    train-rmse:24.778642+0.947764   test-rmse:39.191367+4.593372 
## [681]    train-rmse:24.539269+0.941668   test-rmse:39.117881+4.599828 
## [701]    train-rmse:24.321892+0.945048   test-rmse:39.066549+4.594213 
## [721]    train-rmse:24.098167+0.938395   test-rmse:39.012077+4.608622 
## [741]    train-rmse:23.887989+0.959588   test-rmse:38.985871+4.620076 
## [761]    train-rmse:23.694833+0.949804   test-rmse:38.947540+4.640001 
## [781]    train-rmse:23.474198+0.947959   test-rmse:38.884040+4.646983 
## [801]    train-rmse:23.272271+0.959769   test-rmse:38.829581+4.644860 
## [821]    train-rmse:23.078820+0.972193   test-rmse:38.800542+4.651635 
## [841]    train-rmse:22.878935+0.989104   test-rmse:38.773344+4.661049 
## [861]    train-rmse:22.680724+0.991005   test-rmse:38.739232+4.658809 
## [881]    train-rmse:22.490293+0.999787   test-rmse:38.709619+4.679306 
## [901]    train-rmse:22.306570+1.001152   test-rmse:38.676327+4.677275 
## [921]    train-rmse:22.123903+0.999008   test-rmse:38.645407+4.681627 
## [941]    train-rmse:21.943767+1.001066   test-rmse:38.615389+4.679911 
## [961]    train-rmse:21.761151+1.000074   test-rmse:38.567272+4.681637 
## [981]    train-rmse:21.582176+0.993373   test-rmse:38.554173+4.684477 
## [1000]   train-rmse:21.412550+0.996742   test-rmse:38.517828+4.694399
# Cross-validate with the smallest learning rate tried (eta = 0.005) so its
# convergence can be compared against the larger-eta models above.
set.seed(111111)
bst_mod_5 <- xgb.cv(data = dtrain, # Set training data
                    nfold = 5, # Use 5-fold cross-validation
                    eta = 0.005, # Set learning rate
                    max_depth = 5, # Set max depth (canonical spelling; `max.depth` is a legacy alias)
                    min_child_weight = 5, # Set minimum number of samples in node to split
                    gamma = 0, # Set minimum loss reduction for split
                    subsample = 0.7, # Set proportion of training data to use in tree
                    colsample_bytree = 0.9, # Set proportion of variables to use in each tree
                    nrounds = 1000, # Set number of rounds
                    early_stopping_rounds = 20, # Stop if test RMSE fails to improve for 20 rounds
                    verbose = 1, # 1 - Prints out fit
                    nthread = 1, # Set number of parallel threads
                    print_every_n = 20 # Prints out result every 20th iteration
) # Cross-validation results are stored in bst_mod_5$evaluation_log
## [1]  train-rmse:170.515927+1.084205  test-rmse:170.479538+4.341769 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 20 rounds.
## 
## [21] train-rmse:155.827283+1.014768  test-rmse:155.990434+4.410453 
## [41] train-rmse:142.537726+0.962567  test-rmse:142.924263+4.508570 
## [61] train-rmse:130.591073+0.925791  test-rmse:131.199487+4.605702 
## [81] train-rmse:119.789054+0.895809  test-rmse:120.669848+4.705063 
## [101]    train-rmse:110.056343+0.845651  test-rmse:111.196599+4.811344 
## [121]    train-rmse:101.275323+0.823743  test-rmse:102.666300+4.896093 
## [141]    train-rmse:93.393189+0.789487   test-rmse:95.099849+5.022794 
## [161]    train-rmse:86.321183+0.794908   test-rmse:88.338031+5.082084 
## [181]    train-rmse:79.951666+0.771745   test-rmse:82.326668+5.163812 
## [201]    train-rmse:74.251661+0.763005   test-rmse:76.992886+5.271831 
## [221]    train-rmse:69.158203+0.726369   test-rmse:72.264182+5.351938 
## [241]    train-rmse:64.579888+0.725927   test-rmse:68.076432+5.394121 
## [261]    train-rmse:60.479750+0.705619   test-rmse:64.392755+5.464637 
## [281]    train-rmse:56.845883+0.700854   test-rmse:61.167144+5.522503 
## [301]    train-rmse:53.602522+0.686929   test-rmse:58.307859+5.553678 
## [321]    train-rmse:50.719587+0.672038   test-rmse:55.839219+5.578217 
## [341]    train-rmse:48.167776+0.651343   test-rmse:53.715719+5.587653 
## [361]    train-rmse:45.891866+0.661654   test-rmse:51.861258+5.579469 
## [381]    train-rmse:43.881180+0.659681   test-rmse:50.250897+5.557247 
## [401]    train-rmse:42.082187+0.675744   test-rmse:48.839272+5.527592 
## [421]    train-rmse:40.496306+0.671839   test-rmse:47.657982+5.485309 
## [441]    train-rmse:39.104087+0.681247   test-rmse:46.638588+5.434602 
## [461]    train-rmse:37.844241+0.677333   test-rmse:45.744898+5.395461 
## [481]    train-rmse:36.740656+0.677980   test-rmse:44.970872+5.343011 
## [501]    train-rmse:35.750957+0.676105   test-rmse:44.295341+5.288149 
## [521]    train-rmse:34.891312+0.689270   test-rmse:43.743728+5.234937 
## [541]    train-rmse:34.103090+0.709530   test-rmse:43.233147+5.203919 
## [561]    train-rmse:33.406821+0.715545   test-rmse:42.798722+5.148838 
## [581]    train-rmse:32.795585+0.735327   test-rmse:42.430875+5.099515 
## [601]    train-rmse:32.237267+0.765588   test-rmse:42.128326+5.055134 
## [621]    train-rmse:31.736642+0.785292   test-rmse:41.855984+5.016137 
## [641]    train-rmse:31.290331+0.802890   test-rmse:41.615450+4.980566 
## [661]    train-rmse:30.876014+0.812018   test-rmse:41.391680+4.927557 
## [681]    train-rmse:30.510066+0.829420   test-rmse:41.211395+4.911014 
## [701]    train-rmse:30.151838+0.830699   test-rmse:41.016608+4.879565 
## [721]    train-rmse:29.831638+0.827219   test-rmse:40.865355+4.868538 
## [741]    train-rmse:29.549770+0.852200   test-rmse:40.744511+4.848055 
## [761]    train-rmse:29.270166+0.863540   test-rmse:40.617351+4.832612 
## [781]    train-rmse:29.002395+0.870984   test-rmse:40.489688+4.819068 
## [801]    train-rmse:28.770349+0.873786   test-rmse:40.379841+4.792393 
## [821]    train-rmse:28.543152+0.878570   test-rmse:40.283545+4.772872 
## [841]    train-rmse:28.333385+0.889542   test-rmse:40.201014+4.769705 
## [861]    train-rmse:28.132834+0.875737   test-rmse:40.118480+4.751033 
## [881]    train-rmse:27.951495+0.880850   test-rmse:40.042283+4.729199 
## [901]    train-rmse:27.766082+0.885420   test-rmse:39.968258+4.714211 
## [921]    train-rmse:27.585302+0.883028   test-rmse:39.895495+4.702939 
## [941]    train-rmse:27.413246+0.879127   test-rmse:39.836018+4.699596 
## [961]    train-rmse:27.239513+0.881716   test-rmse:39.779810+4.697129 
## [981]    train-rmse:27.078620+0.892377   test-rmse:39.737000+4.703278 
## [1000]   train-rmse:26.925898+0.881514   test-rmse:39.677833+4.703795
# eta plots

# Collect the cross-validation logs of all five models into one data frame.
# Each model is paired with the learning rate it was trained with, replacing
# the five copy-pasted pd1..pd5 blocks with a single loop.
eta_models <- list(bst_mod_1, bst_mod_2, bst_mod_3, bst_mod_4, bst_mod_5)
eta_values <- c(0.3, 0.1, 0.05, 0.01, 0.005)
plot_data <- do.call(rbind.data.frame, Map(function(mod, eta) {
  res <- mod$evaluation_log[, c("iter", "test_rmse_mean")]
  res$eta <- eta # Tag every row with the learning rate used
  res
}, eta_models, eta_values))
# Convert eta to a factor so ggplot treats it as a discrete colour scale
plot_data$eta <- as.factor(plot_data$eta)
# Plot points
# Scatter plot: test RMSE against number of trees, coloured by learning rate
g_6 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_point(alpha = 0.5) + # Semi-transparent points so overlapping curves stay visible
  theme_bw() + # Base theme
  theme(panel.background = element_blank(), # Strip all grid/border decoration
        panel.border = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank()) +
  labs(x = "Number of Trees",
       y = "RMSE",
       title = "RMSE v Number of Trees",
       color = "Learning \n Rate") # Axis, title and legend labels
g_6

# Smoothed version of the same comparison: one fitted curve per learning rate
g_7 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_smooth(alpha = 0.5) + # Smoothed trend lines instead of raw points
  theme_bw() + # Base theme
  theme(panel.background = element_blank(), # Strip all grid/border decoration
        panel.border = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank()) +
  labs(x = "Number of Trees",
       y = "RMSE",
       title = "RMSE v Number of Trees",
       color = "Learning \n Rate") # Axis, title and legend labels
g_7
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Fit the final model on the full training set with the tuned hyper-parameters
# (eta = 0.1 and 243 rounds chosen from the cross-validation results above).
# NOTE: the original comment said "xgb.cv" but this is a plain xgboost() fit.
set.seed(111111)
bst_mod_final <- xgboost(data = dtrain, # Set training data
                         eta = 0.1, # Tuned learning rate
                         max_depth = 5, # Set max depth (canonical spelling; `max.depth` is a legacy alias)
                         min_child_weight = 5, # Set minimum number of samples in node to split
                         gamma = 0, # Set minimum loss reduction for split
                         subsample = 0.7, # Set proportion of training data to use in tree
                         colsample_bytree = 0.9, # Set proportion of variables to use in each tree
                         nrounds = 243, # Number of rounds chosen from CV
                         early_stopping_rounds = 50, # NOTE(review): with no watchlist this
                         # monitors train-rmse (see output below), so it will almost never
                         # trigger; rely on the CV-chosen nrounds instead
                         verbose = 1, # 1 - Prints out fit
                         nthread = 1, # Set number of parallel threads
                         print_every_n = 20 # Prints out result every 20th iteration
) # Returns the fitted booster
## [1]  train-rmse:155.776041 
## Will train until train_rmse hasn't improved in 50 rounds.
## 
## [21] train-rmse:39.900436 
## [41] train-rmse:29.229977 
## [61] train-rmse:26.590835 
## [81] train-rmse:24.482098 
## [101]    train-rmse:22.712217 
## [121]    train-rmse:21.010145 
## [141]    train-rmse:19.606408 
## [161]    train-rmse:18.283559 
## [181]    train-rmse:17.108097 
## [201]    train-rmse:15.869291 
## [221]    train-rmse:14.740592 
## [241]    train-rmse:13.799385 
## [243]    train-rmse:13.688852
# Predict total cost on the held-out test set and report error metrics
bst_predsf <- predict(bst_mod_final, dtest)
print(accuracy(bst_predsf, test_data$tc))
##                 ME     RMSE      MAE       MPE     MAPE
## Test set 0.2126129 35.10928 21.65655 -3.955761 14.16861

The test-set RMSE has gone down relative to the earlier untuned fit, so the tuned hyper-parameters improved predictive performance.

Produce final model

# Predicted vs actual total cost; the dashed 45-degree line marks perfect prediction
plot_dat <- data.frame(predicted = bst_predsf, actual = test_data$tc)

ggplot(plot_dat, aes(x = predicted, y = actual)) +
  geom_point() +
  geom_smooth() + # Smoothed trend through the cloud of points
  xlim(-100, 850) + # Shared axis limits keep the plot square
  ylim(-100, 850) +
  geom_abline(slope = 1, linetype = 2) # Reference line: predicted == actual
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Extract variable importance

# Extract feature importance from the tuned final model (bst_mod_final)
imp_mat <- xgb.importance(model = bst_mod_final) # Importance matrix for the tuned booster
# Plot importance (top 10 variables)
xgb.plot.importance(imp_mat, top_n = 10)

Interpretation

cres and csor have the highest importance scores, indicating that these features contribute the most to the total cost predictions. These variables likely have strong predictive power over the target variable.

## SHAP

# Compute SHAP values for the tuned model and rank features by mean |SHAP|
shap_result <- shap.score.rank(xgb_model = bst_mod_final,
                               X_train = as.matrix(train_dummy),
                               shap_approx = FALSE) # Exact SHAP, not the fast approximation
## make SHAP score by decreasing order

SHAP summary for the tuned model

# Reshape SHAP scores to long format for the top 10 features, then plot the summary
shap_long <- shap.prep(shap = shap_result,
                       X_train = as.matrix(train_dummy),
                       top_n = 10)

plot.shap.summary(data_long = shap_long)

Trying with cres and csor dropped

# Rebuild the DMatrix objects with the first two columns (cres and csor) dropped
dtrain <- xgb.DMatrix(data = as.matrix(train_dummy[, 3:ncol(train_dummy)]), label = as.numeric(train_data$tc))
# Create test matrix
# FIX: index test_dummy by its own column count, not ncol(train_dummy) —
# the original only worked because both frames happened to have the same width
dtest <- xgb.DMatrix(data = as.matrix(test_dummy[, 3:ncol(test_dummy)]), label = as.numeric(test_data$tc))

XGBoost

# Fit a default-parameter XGBoost model on the reduced feature set
set.seed(111111)
bst_1 <- xgboost(data = dtrain, # Training data without cres/csor
                 nrounds = 100, # Number of boosting rounds
                 verbose = 1, # Print fit progress
                 print_every_n = 20 # Report every 20th iteration
) # Returns the fitted booster
## [1]  train-rmse:126.003442 
## [21] train-rmse:22.859882 
## [41] train-rmse:15.842871 
## [61] train-rmse:11.735085 
## [81] train-rmse:8.815739 
## [100]    train-rmse:6.715222
# Score the reduced-feature model on the test set and report error metrics
bst_preds <- predict(bst_1, dtest)
print(accuracy(bst_preds, test_data$tc))
##                ME     RMSE      MAE       MPE     MAPE
## Test set 1.338493 50.34017 30.71847 -5.845864 20.25521

Compared with the tuned model above, both MAE and RMSE have increased, which suggests that cres and csor carried substantial predictive information.

# Predicted vs actual for the reduced-feature model; dashed line = perfect prediction
plot_dat <- data.frame(predicted = bst_preds, actual = test_data$tc)

ggplot(plot_dat, aes(x = predicted, y = actual)) +
  geom_point() +
  geom_smooth() + # Smoothed trend through the cloud of points
  xlim(-100, 850) + # Shared axis limits keep the plot square
  ylim(-100, 850) +
  geom_abline(slope = 1, linetype = 2) # Reference line: predicted == actual
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Interpretation: the predictions still track the 45-degree line reasonably well, although the error metrics above show some loss of accuracy.

Variable Importance

# Extract feature importance from the reduced-feature model (bst_1)
imp_mat <- xgb.importance(model = bst_1)
# Plot importance (top 10 variables)
xgb.plot.importance(imp_mat, top_n = 10)

Interpretation

After removing cres and csor, the feature that contributes the most to the total cost predictions is gdp.

# Load helper functions for SHAP analysis (shap.score.rank, shap.prep, plot.shap.summary)
# NOTE(review): hard-coded user-specific path — consider a project-relative path
source("~/Downloads/a_insights_shap_functions.r")

SHAP

# Compute SHAP values for the reduced-feature model, using the same
# reduced column range (cres and csor dropped) as the training matrix
shap_result <- shap.score.rank(xgb_model = bst_1,
                               X_train = as.matrix(train_dummy[, 3:ncol(train_dummy)]),
                               shap_approx = FALSE) # Exact SHAP, not the fast approximation
## make SHAP score by decreasing order
# Reshape SHAP scores to long format for the top 10 features, then plot the summary
shap_long <- shap.prep(shap = shap_result,
                       X_train = as.matrix(train_dummy[, 3:ncol(train_dummy)]),
                       top_n = 10)

plot.shap.summary(data_long = shap_long)